# Read the listings CSV (the path is relative to the working directory)
air <- read.csv(file = "listings-complete.csv")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Print the name of every column in the raw data
names(air)
## [1] "id"
## [2] "listing_url"
## [3] "scrape_id"
## [4] "last_scraped"
## [5] "source"
## [6] "name"
## [7] "description"
## [8] "neighborhood_overview"
## [9] "picture_url"
## [10] "host_id"
## [11] "host_url"
## [12] "host_name"
## [13] "host_since"
## [14] "host_location"
## [15] "host_about"
## [16] "host_response_time"
## [17] "host_response_rate"
## [18] "host_acceptance_rate"
## [19] "host_is_superhost"
## [20] "host_thumbnail_url"
## [21] "host_picture_url"
## [22] "host_neighbourhood"
## [23] "host_listings_count"
## [24] "host_total_listings_count"
## [25] "host_verifications"
## [26] "host_has_profile_pic"
## [27] "host_identity_verified"
## [28] "neighbourhood"
## [29] "neighbourhood_cleansed"
## [30] "neighbourhood_group_cleansed"
## [31] "latitude"
## [32] "longitude"
## [33] "property_type"
## [34] "room_type"
## [35] "accommodates"
## [36] "bathrooms"
## [37] "bathrooms_text"
## [38] "bedrooms"
## [39] "beds"
## [40] "amenities"
## [41] "price"
## [42] "minimum_nights"
## [43] "maximum_nights"
## [44] "minimum_minimum_nights"
## [45] "maximum_minimum_nights"
## [46] "minimum_maximum_nights"
## [47] "maximum_maximum_nights"
## [48] "minimum_nights_avg_ntm"
## [49] "maximum_nights_avg_ntm"
## [50] "calendar_updated"
## [51] "has_availability"
## [52] "availability_30"
## [53] "availability_60"
## [54] "availability_90"
## [55] "availability_365"
## [56] "calendar_last_scraped"
## [57] "number_of_reviews"
## [58] "number_of_reviews_ltm"
## [59] "number_of_reviews_l30d"
## [60] "first_review"
## [61] "last_review"
## [62] "review_scores_rating"
## [63] "review_scores_accuracy"
## [64] "review_scores_cleanliness"
## [65] "review_scores_checkin"
## [66] "review_scores_communication"
## [67] "review_scores_location"
## [68] "review_scores_value"
## [69] "license"
## [70] "instant_bookable"
## [71] "calculated_host_listings_count"
## [72] "calculated_host_listings_count_entire_homes"
## [73] "calculated_host_listings_count_private_rooms"
## [74] "calculated_host_listings_count_shared_rooms"
## [75] "reviews_per_month"
# Convert every character column to a factor.
# mutate(across(where(...))) replaces the superseded mutate_if() and produces
# the same result.
air <- air %>%
  mutate(across(where(is.character), as.factor))

# Derive a numeric "bathrooms" column from "bathrooms_text" by removing every
# character that is not a digit or a decimal point.
# NOTE(review): entries that contain no digits (e.g. a purely textual
# "half-bath" label, if the data has any) become NA here and are then removed
# by na.omit() below -- confirm this is intended.
air$bathrooms <- as.numeric(gsub("[^0-9.]+", "", air$bathrooms_text))

# Variables kept for the analysis (the order here is the column order of
# air_new / air_clean)
selected_variables <- c(
  "host_is_superhost", "host_response_time", "host_response_rate",
  "host_acceptance_rate", "accommodates",
  "bathrooms", "bedrooms", "price", "neighbourhood_cleansed",
  "host_listings_count", "minimum_nights", "maximum_nights",
  "instant_bookable", "host_identity_verified", "availability_30",
  "availability_60", "availability_90", "review_scores_rating",
  "reviews_per_month", "has_availability", "availability_365",
  "number_of_reviews"
)

# Keep only the selected columns
air_new <- air[selected_variables]

# Drop every row that has an NA in any selected column
air_clean <- na.omit(air_new)

# Inspect the column types of the cleaned data
str(air_clean)
## 'data.frame': 4645 obs. of 22 variables:
## $ host_is_superhost : Factor w/ 2 levels "f","t": 2 2 1 1 1 1 1 1 1 2 ...
## $ host_response_time : Factor w/ 5 levels "a few days or more",..: 5 5 2 4 2 1 2 2 2 5 ...
## $ host_response_rate : Factor w/ 41 levels "0%","100%","11%",..: 2 2 41 2 41 1 41 41 41 2 ...
## $ host_acceptance_rate : Factor w/ 79 levels "0%","100%","11%",..: 75 70 79 72 2 79 1 79 79 2 ...
## $ accommodates : int 2 2 4 2 4 6 7 2 2 2 ...
## $ bathrooms : num 1 1 1 1 1 1 3 1 1 1 ...
## $ bedrooms : int 1 1 1 1 2 3 4 1 1 1 ...
## $ price : Factor w/ 566 levels "$1,000.00","$1,029.00",..: 90 83 83 42 294 30 128 551 412 39 ...
## $ neighbourhood_cleansed: Factor w/ 23 levels "Arbutus Ridge",..: 22 8 8 3 7 6 15 13 13 6 ...
## $ host_listings_count : int 1 3 1 4 1 5 1 1 1 1 ...
## $ minimum_nights : int 2 30 3 30 3 365 5 5 30 1 ...
## $ maximum_nights : int 90 180 7 1125 31 365 29 60 1125 40 ...
## $ instant_bookable : Factor w/ 2 levels "f","t": 1 1 2 1 1 1 1 1 1 1 ...
## $ host_identity_verified: Factor w/ 2 levels "f","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ availability_30 : int 18 0 0 1 0 30 10 0 17 0 ...
## $ availability_60 : int 22 5 0 1 0 60 10 0 35 0 ...
## $ availability_90 : int 34 6 0 1 0 90 10 0 54 0 ...
## $ review_scores_rating : num 4.68 4.92 4.76 4.69 4.57 4 5 4.6 4.54 4.98 ...
## $ reviews_per_month : num 2.86 0.67 0.22 1.66 0.12 0.05 0.05 1.6 0.81 3.25 ...
## $ has_availability : Factor w/ 2 levels "f","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ availability_365 : int 212 152 0 70 194 90 19 0 226 0 ...
## $ number_of_reviews : int 443 96 34 265 7 3 7 203 118 466 ...
## - attr(*, "na.action")= 'omit' Named int [1:1330] 9 23 72 77 83 86 87 92 93 94 ...
## ..- attr(*, "names")= chr [1:1330] "9" "23" "72" "77" ...
# Convert "price" (e.g. "$1,000.00") to numeric by removing "$" and ","
air_clean$price <- as.numeric(gsub("[$,]", "", air_clean$price))

# Convert a percentage column (e.g. "100%") to a proportion (value / 100).
# The missing-value placeholder is mapped to NA explicitly instead of relying
# on as.numeric() to coerce it with an "NAs introduced by coercion" warning.
# NOTE(review): assumes the placeholder is the string "N/A", as seen in the
# levels of host_response_time; any other non-numeric value still triggers the
# coercion warning, exactly as before.
percent_to_proportion <- function(x) {
  x <- as.character(x)
  x[x %in% "N/A"] <- NA_character_
  as.numeric(gsub("%", "", x, fixed = TRUE)) / 100
}

# Convert "host_acceptance_rate" and "host_response_rate" columns to numeric
air_clean$host_acceptance_rate <-
  percent_to_proportion(air_clean$host_acceptance_rate)
air_clean$host_response_rate <-
  percent_to_proportion(air_clean$host_response_rate)

# Check the structure of air_clean again. The two rate columns can now hold NA
# values, because the placeholder rows were not NA when na.omit() ran earlier.
str(air_clean)
## 'data.frame': 4645 obs. of 22 variables:
## $ host_is_superhost : Factor w/ 2 levels "f","t": 2 2 1 1 1 1 1 1 1 2 ...
## $ host_response_time : Factor w/ 5 levels "a few days or more",..: 5 5 2 4 2 1 2 2 2 5 ...
## $ host_response_rate : num 1 1 NA 1 NA 0 NA NA NA 1 ...
## $ host_acceptance_rate : num 0.96 0.91 NA 0.93 1 NA 0 NA NA 1 ...
## $ accommodates : int 2 2 4 2 4 6 7 2 2 2 ...
## $ bathrooms : num 1 1 1 1 1 1 3 1 1 1 ...
## $ bedrooms : int 1 1 1 1 2 3 4 1 1 1 ...
## $ price : num 157 150 150 110 350 100 195 94 51 109 ...
## $ neighbourhood_cleansed: Factor w/ 23 levels "Arbutus Ridge",..: 22 8 8 3 7 6 15 13 13 6 ...
## $ host_listings_count : int 1 3 1 4 1 5 1 1 1 1 ...
## $ minimum_nights : int 2 30 3 30 3 365 5 5 30 1 ...
## $ maximum_nights : int 90 180 7 1125 31 365 29 60 1125 40 ...
## $ instant_bookable : Factor w/ 2 levels "f","t": 1 1 2 1 1 1 1 1 1 1 ...
## $ host_identity_verified: Factor w/ 2 levels "f","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ availability_30 : int 18 0 0 1 0 30 10 0 17 0 ...
## $ availability_60 : int 22 5 0 1 0 60 10 0 35 0 ...
## $ availability_90 : int 34 6 0 1 0 90 10 0 54 0 ...
## $ review_scores_rating : num 4.68 4.92 4.76 4.69 4.57 4 5 4.6 4.54 4.98 ...
## $ reviews_per_month : num 2.86 0.67 0.22 1.66 0.12 0.05 0.05 1.6 0.81 3.25 ...
## $ has_availability : Factor w/ 2 levels "f","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ availability_365 : int 212 152 0 70 194 90 19 0 226 0 ...
## $ number_of_reviews : int 443 96 34 265 7 3 7 203 118 466 ...
## - attr(*, "na.action")= 'omit' Named int [1:1330] 9 23 72 77 83 86 87 92 93 94 ...
## ..- attr(*, "names")= chr [1:1330] "9" "23" "72" "77" ...
# Per-column summary statistics of the cleaned data
summary(object = air_clean)
## host_is_superhost host_response_time host_response_rate
## f:2788 a few days or more: 45 Min. :0.0000
## t:1857 N/A : 807 1st Qu.:1.0000
## within a day : 230 Median :1.0000
## within a few hours: 618 Mean :0.9746
## within an hour :2945 3rd Qu.:1.0000
## Max. :1.0000
## NA's :807
## host_acceptance_rate accommodates bathrooms bedrooms
## Min. :0.0000 Min. : 1.000 Min. :0.000 Min. : 1.000
## 1st Qu.:0.9100 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 1.000
## Median :0.9900 Median : 3.000 Median :1.000 Median : 1.000
## Mean :0.9044 Mean : 3.597 Mean :1.333 Mean : 1.612
## 3rd Qu.:1.0000 3rd Qu.: 4.000 3rd Qu.:1.500 3rd Qu.: 2.000
## Max. :1.0000 Max. :16.000 Max. :7.000 Max. :10.000
## NA's :568
## price neighbourhood_cleansed host_listings_count
## Min. : 14.0 Downtown :1236 Min. : 1.000
## 1st Qu.: 105.0 Kitsilano : 369 1st Qu.: 1.000
## Median : 150.0 West End : 340 Median : 1.000
## Mean : 190.6 Kensington-Cedar Cottage: 305 Mean : 8.332
## 3rd Qu.: 216.0 Mount Pleasant : 279 3rd Qu.: 4.000
## Max. :9888.0 Downtown Eastside : 253 Max. :513.000
## (Other) :1863
## minimum_nights maximum_nights instant_bookable host_identity_verified
## Min. : 1.00 Min. : 1 f:3540 f: 262
## 1st Qu.: 2.00 1st Qu.: 90 t:1105 t:4383
## Median : 3.00 Median : 365
## Mean : 14.22 Mean : 569
## 3rd Qu.: 30.00 3rd Qu.:1125
## Max. :399.00 Max. :1125
##
## availability_30 availability_60 availability_90 review_scores_rating
## Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. :0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 4.00 1st Qu.:4.710
## Median : 6.000 Median :18.00 Median :31.00 Median :4.870
## Mean : 8.989 Mean :21.92 Mean :35.15 Mean :4.754
## 3rd Qu.:15.000 3rd Qu.:38.00 3rd Qu.:61.00 3rd Qu.:5.000
## Max. :30.000 Max. :60.00 Max. :90.00 Max. :5.000
##
## reviews_per_month has_availability availability_365 number_of_reviews
## Min. : 0.01 f: 30 Min. : 0 Min. : 1.00
## 1st Qu.: 0.31 t:4615 1st Qu.: 29 1st Qu.: 5.00
## Median : 1.00 Median : 97 Median : 17.00
## Mean : 1.64 Mean :130 Mean : 44.77
## 3rd Qu.: 2.55 3rd Qu.:217 3rd Qu.: 55.00
## Max. :11.15 Max. :365 Max. :863.00
##
# Frequency table of the bathroom counts
bathrooms_count <- table(air_clean$bathrooms)
library(ggplot2)

# Columns are referenced by bare name inside aes() and facet_wrap(): ggplot2
# resolves them in the `data` argument. Writing `air_clean$col` there bypasses
# `data` and is discouraged by ggplot2.

# Price against accommodates, one panel per bedroom count
p <- ggplot(air_clean, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bedrooms)
print(
  p + labs(
    title = "Price compared to accommodation size split by bedrooms (Model 1)",
    y = "Price",
    x = "Accommodates"
  )
)

# Price against accommodates, one panel per bathroom count
p <- ggplot(air_clean, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bathrooms)
print(
  p + labs(
    title = "Price compared to accommodation size split by bathrooms (Model 1)",
    y = "Price",
    x = "Accommodates"
  )
)

# Price against the number of bathrooms
p <- ggplot(air_clean, aes(x = bathrooms, y = price)) +
  geom_point()
print(
  p + labs(
    title = "Price compared to bathrooms (Model 1)",
    y = "Price",
    x = "Bathrooms"
  )
)

# Price against the number of reviews
p <- ggplot(air_clean, aes(x = number_of_reviews, y = price)) +
  geom_point()
print(
  p + labs(
    title = "Price compared to the number of reviews (Model 1)",
    y = "Price",
    x = "Number of reviews"
  )
)
# Correlation matrix over the design matrix of air_clean: model.matrix() with
# "~ 0 + ." expands the factors into indicator columns and omits the intercept
correlation_matrix <- model.matrix(~ 0 + ., data = air_clean) %>%
  cor(use = "pairwise.complete.obs")
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero

# Show small values in fixed notation rather than scientific notation
options(scipen = 999)

# Row of the matrix holding each column's correlation with price
review_correlations <- correlation_matrix["price", ]

# Order from most positive to most negative; sort() drops NA correlations
sorted_correlations <- sort(review_correlations, decreasing = TRUE)

# Two-column table: predictor name and its correlation with price
correlation_df <- data.frame(
  Predictor_Variable = names(sorted_correlations),
  Correlation_with_y = sorted_correlations
)

# Print the sorted correlations table
print(correlation_df)
## Predictor_Variable
## price price
## bathrooms bathrooms
## bedrooms bedrooms
## accommodates accommodates
## review_scores_rating review_scores_rating
## neighbourhood_cleansedDowntown neighbourhood_cleansedDowntown
## neighbourhood_cleansedWest Point Grey neighbourhood_cleansedWest Point Grey
## availability_30 availability_30
## availability_60 availability_60
## availability_365 availability_365
## neighbourhood_cleansedKitsilano neighbourhood_cleansedKitsilano
## availability_90 availability_90
## neighbourhood_cleansedMount Pleasant neighbourhood_cleansedMount Pleasant
## neighbourhood_cleansedOakridge neighbourhood_cleansedOakridge
## maximum_nights maximum_nights
## host_acceptance_rate host_acceptance_rate
## host_is_superhostt host_is_superhostt
## minimum_nights minimum_nights
## host_listings_count host_listings_count
## neighbourhood_cleansedWest End neighbourhood_cleansedWest End
## neighbourhood_cleansedKerrisdale neighbourhood_cleansedKerrisdale
## host_response_timewithin an hour host_response_timewithin an hour
## host_response_timewithin a day host_response_timewithin a day
## instant_bookablet instant_bookablet
## host_response_timewithin a few hours host_response_timewithin a few hours
## has_availabilityt has_availabilityt
## neighbourhood_cleansedShaughnessy neighbourhood_cleansedShaughnessy
## neighbourhood_cleansedStrathcona neighbourhood_cleansedStrathcona
## neighbourhood_cleansedDowntown Eastside neighbourhood_cleansedDowntown Eastside
## neighbourhood_cleansedDunbar Southlands neighbourhood_cleansedDunbar Southlands
## neighbourhood_cleansedFairview neighbourhood_cleansedFairview
## neighbourhood_cleansedRiley Park neighbourhood_cleansedRiley Park
## host_is_superhostf host_is_superhostf
## host_response_rate host_response_rate
## neighbourhood_cleansedGrandview-Woodland neighbourhood_cleansedGrandview-Woodland
## neighbourhood_cleansedSouth Cambie neighbourhood_cleansedSouth Cambie
## neighbourhood_cleansedSunset neighbourhood_cleansedSunset
## neighbourhood_cleansedKillarney neighbourhood_cleansedKillarney
## neighbourhood_cleansedKensington-Cedar Cottage neighbourhood_cleansedKensington-Cedar Cottage
## neighbourhood_cleansedVictoria-Fraserview neighbourhood_cleansedVictoria-Fraserview
## reviews_per_month reviews_per_month
## neighbourhood_cleansedMarpole neighbourhood_cleansedMarpole
## host_identity_verifiedt host_identity_verifiedt
## neighbourhood_cleansedHastings-Sunrise neighbourhood_cleansedHastings-Sunrise
## neighbourhood_cleansedRenfrew-Collingwood neighbourhood_cleansedRenfrew-Collingwood
## number_of_reviews number_of_reviews
## Correlation_with_y
## price 1.000000000
## bathrooms 0.395649452
## bedrooms 0.395114285
## accommodates 0.380550792
## review_scores_rating 0.071042539
## neighbourhood_cleansedDowntown 0.060304372
## neighbourhood_cleansedWest Point Grey 0.058879237
## availability_30 0.055708574
## availability_60 0.048433605
## availability_365 0.048410249
## neighbourhood_cleansedKitsilano 0.045056580
## availability_90 0.038783855
## neighbourhood_cleansedMount Pleasant 0.026480149
## neighbourhood_cleansedOakridge 0.024402039
## maximum_nights 0.021202133
## host_acceptance_rate 0.019713798
## host_is_superhostt 0.018904712
## minimum_nights 0.013038382
## host_listings_count 0.011821623
## neighbourhood_cleansedWest End 0.004889116
## neighbourhood_cleansedKerrisdale 0.004336057
## host_response_timewithin an hour -0.001229694
## host_response_timewithin a day -0.001977631
## instant_bookablet -0.002355934
## host_response_timewithin a few hours -0.002934061
## has_availabilityt -0.003912086
## neighbourhood_cleansedShaughnessy -0.006306109
## neighbourhood_cleansedStrathcona -0.006593262
## neighbourhood_cleansedDowntown Eastside -0.006655020
## neighbourhood_cleansedDunbar Southlands -0.008912073
## neighbourhood_cleansedFairview -0.013810626
## neighbourhood_cleansedRiley Park -0.014944620
## host_is_superhostf -0.018904712
## host_response_rate -0.020247048
## neighbourhood_cleansedGrandview-Woodland -0.020587907
## neighbourhood_cleansedSouth Cambie -0.021280211
## neighbourhood_cleansedSunset -0.021825216
## neighbourhood_cleansedKillarney -0.024146890
## neighbourhood_cleansedKensington-Cedar Cottage -0.030649981
## neighbourhood_cleansedVictoria-Fraserview -0.030908505
## reviews_per_month -0.036048000
## neighbourhood_cleansedMarpole -0.042170158
## host_identity_verifiedt -0.052983328
## neighbourhood_cleansedHastings-Sunrise -0.053412240
## neighbourhood_cleansedRenfrew-Collingwood -0.057621921
## number_of_reviews -0.058361420
#install.packages("plotly")
# library() stops with an error right here if plotly is not installed;
# require() would only return FALSE and let the script fail later at plot_ly().
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Margin settings passed to the plotly layout
m <- list(l = 10, r = 10, b = 10, t = 10, pad = 1)

# Interactive heatmap of the correlation matrix with a blue-white-red ramp
heatmap <- plot_ly(
  x = colnames(correlation_matrix),
  y = rownames(correlation_matrix),
  z = correlation_matrix,
  type = "heatmap",
  colors = colorRamp(c("darkblue", "white", "darkred"))
)
heatmap <- layout(heatmap, margin = m)

# Optionally save the graph as an html file
#htmlwidgets::saveWidget(as_widget(heatmap), "heatmap.html")

# Display the heatmap
heatmap
This is the kitchen-sink OLS model. It includes all of the variables without any data transformations.
# lm() comes from the stats package
library(stats)

# Kitchen-sink OLS fit: price regressed on the untransformed predictors below,
# using the rows of air_clean
model <- lm(
  formula = price ~
    host_is_superhost +
    host_response_time +
    host_response_rate +
    host_acceptance_rate +
    accommodates +
    bathrooms +
    bedrooms +
    neighbourhood_cleansed +
    host_listings_count +
    minimum_nights +
    maximum_nights +
    instant_bookable +
    host_identity_verified +
    availability_30 +
    availability_60 +
    availability_90 +
    review_scores_rating +
    reviews_per_month +
    has_availability,
  data = air_clean
)

# Coefficients, standard errors and fit statistics of the model
summary(model)
##
## Call:
## lm(formula = price ~ host_is_superhost + host_response_time +
## host_response_rate + host_acceptance_rate + accommodates +
## bathrooms + bedrooms + neighbourhood_cleansed + host_listings_count +
## minimum_nights + maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month + has_availability, data = air_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -594.8 -53.8 -11.9 29.0 8402.0
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -68.614897 80.456209 -0.853
## host_is_superhostt 14.380850 6.725793 2.138
## host_response_timewithin a day -41.441943 52.046970 -0.796
## host_response_timewithin a few hours -20.990902 55.245538 -0.380
## host_response_timewithin an hour -31.802305 55.852283 -0.569
## host_response_rate -19.117751 57.747157 -0.331
## host_acceptance_rate 28.529234 21.917586 1.302
## accommodates 14.191734 2.955346 4.802
## bathrooms 62.416952 7.547781 8.270
## bedrooms 36.662354 7.413905 4.945
## neighbourhood_cleansedDowntown 20.579325 26.927630 0.764
## neighbourhood_cleansedDowntown Eastside 3.340701 29.384469 0.114
## neighbourhood_cleansedDunbar Southlands -32.701465 31.968067 -1.023
## neighbourhood_cleansedFairview -4.612242 33.123843 -0.139
## neighbourhood_cleansedGrandview-Woodland -29.375280 30.776075 -0.954
## neighbourhood_cleansedHastings-Sunrise -57.948567 30.495717 -1.900
## neighbourhood_cleansedKensington-Cedar Cottage -37.073429 29.047622 -1.276
## neighbourhood_cleansedKerrisdale -60.591455 37.450737 -1.618
## neighbourhood_cleansedKillarney -79.350043 37.557899 -2.113
## neighbourhood_cleansedKitsilano 24.905495 28.393454 0.877
## neighbourhood_cleansedMarpole -64.347936 32.738124 -1.966
## neighbourhood_cleansedMount Pleasant 31.366633 29.036472 1.080
## neighbourhood_cleansedOakridge -4.108600 35.524962 -0.116
## neighbourhood_cleansedRenfrew-Collingwood -51.344649 30.056500 -1.708
## neighbourhood_cleansedRiley Park -21.267632 29.282984 -0.726
## neighbourhood_cleansedShaughnessy -14.297555 35.721561 -0.400
## neighbourhood_cleansedSouth Cambie -24.227784 37.262393 -0.650
## neighbourhood_cleansedStrathcona -6.936857 46.625311 -0.149
## neighbourhood_cleansedSunset -55.621589 33.615909 -1.655
## neighbourhood_cleansedVictoria-Fraserview -93.116159 34.748721 -2.680
## neighbourhood_cleansedWest End 18.671541 28.765937 0.649
## neighbourhood_cleansedWest Point Grey 74.206770 36.685625 2.023
## host_listings_count 0.035374 0.100269 0.353
## minimum_nights 0.227415 0.153342 1.483
## maximum_nights 0.001765 0.006622 0.267
## instant_bookablet -8.478193 7.538493 -1.125
## host_identity_verifiedt -64.831607 17.255350 -3.757
## availability_30 1.700694 0.768408 2.213
## availability_60 -0.345842 0.757097 -0.457
## availability_90 0.076607 0.379661 0.202
## review_scores_rating 33.004020 7.778605 4.243
## reviews_per_month -5.164100 2.054387 -2.514
## has_availabilityt -15.623611 54.670534 -0.286
## Pr(>|t|)
## (Intercept) 0.393812
## host_is_superhostt 0.032567 *
## host_response_timewithin a day 0.425942
## host_response_timewithin a few hours 0.703999
## host_response_timewithin an hour 0.569118
## host_response_rate 0.740618
## host_acceptance_rate 0.193112
## accommodates 0.000001631 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms 0.000000794 ***
## neighbourhood_cleansedDowntown 0.444769
## neighbourhood_cleansedDowntown Eastside 0.909490
## neighbourhood_cleansedDunbar Southlands 0.306401
## neighbourhood_cleansedFairview 0.889266
## neighbourhood_cleansedGrandview-Woodland 0.339900
## neighbourhood_cleansedHastings-Sunrise 0.057480 .
## neighbourhood_cleansedKensington-Cedar Cottage 0.201929
## neighbourhood_cleansedKerrisdale 0.105768
## neighbourhood_cleansedKillarney 0.034689 *
## neighbourhood_cleansedKitsilano 0.380458
## neighbourhood_cleansedMarpole 0.049426 *
## neighbourhood_cleansedMount Pleasant 0.280100
## neighbourhood_cleansedOakridge 0.907933
## neighbourhood_cleansedRenfrew-Collingwood 0.087668 .
## neighbourhood_cleansedRiley Park 0.467712
## neighbourhood_cleansedShaughnessy 0.688995
## neighbourhood_cleansedSouth Cambie 0.515606
## neighbourhood_cleansedStrathcona 0.881736
## neighbourhood_cleansedSunset 0.098085 .
## neighbourhood_cleansedVictoria-Fraserview 0.007401 **
## neighbourhood_cleansedWest End 0.516323
## neighbourhood_cleansedWest Point Grey 0.043167 *
## host_listings_count 0.724261
## minimum_nights 0.138142
## maximum_nights 0.789848
## instant_bookablet 0.260807
## host_identity_verifiedt 0.000174 ***
## availability_30 0.026939 *
## availability_60 0.647841
## availability_90 0.840101
## review_scores_rating 0.000022592 ***
## reviews_per_month 0.011989 *
## has_availabilityt 0.775064
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 188 on 3775 degrees of freedom
## (827 observations deleted due to missingness)
## Multiple R-squared: 0.2236, Adjusted R-squared: 0.2149
## F-statistic: 25.88 on 42 and 3775 DF, p-value: < 0.00000000000000022
# Remove rows with missing values
# air_clean still contains the NAs produced by the rate conversions; lm()
# skipped those rows when `model` was fitted (its summary reports observations
# deleted due to missingness). The model selected below keeps
# `data = air_clean` in its call, so air_clean itself is reduced to complete
# rows here, before step() runs.
# NOTE(review): step() presumably needs every candidate fit to use the same
# rows -- confirm before moving or removing this na.omit().
air_clean <- na.omit(air_clean)
# Perform stepwise selection
# step() is called with its defaults, starting from the kitchen-sink model;
# it prints the AIC table of each step as it goes.
step_model <- step(model)
## Start: AIC=40029.87
## price ~ host_is_superhost + host_response_time + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month + has_availability
##
## Df Sum of Sq RSS AIC
## - host_response_time 3 103632 133584511 40027
## - availability_90 1 1440 133482318 40028
## - maximum_nights 1 2512 133483390 40028
## - has_availability 1 2888 133483766 40028
## - host_response_rate 1 3875 133484754 40028
## - host_listings_count 1 4401 133485280 40028
## - availability_60 1 7378 133488257 40028
## - instant_bookable 1 44724 133525602 40029
## - host_acceptance_rate 1 59910 133540788 40030
## <none> 133480879 40030
## - minimum_nights 1 77771 133558650 40030
## - host_is_superhost 1 161653 133642532 40032
## - availability_30 1 173209 133654088 40033
## - reviews_per_month 1 223423 133704301 40034
## - host_identity_verified 1 499147 133980025 40042
## - review_scores_rating 1 636550 134117428 40046
## - accommodates 1 815373 134296252 40051
## - bedrooms 1 864667 134345545 40053
## - bathrooms 1 2418068 135898947 40096
## - neighbourhood_cleansed 22 4277486 137758365 40106
##
## Step: AIC=40026.83
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## availability_90 + review_scores_rating + reviews_per_month +
## has_availability
##
## Df Sum of Sq RSS AIC
## - availability_90 1 1249 133585759 40025
## - maximum_nights 1 1274 133585785 40025
## - has_availability 1 2765 133587275 40025
## - availability_60 1 7121 133591632 40025
## - host_listings_count 1 7176 133591686 40025
## - host_response_rate 1 48400 133632910 40026
## - instant_bookable 1 50791 133635302 40026
## - host_acceptance_rate 1 57199 133641710 40026
## <none> 133584511 40027
## - minimum_nights 1 80187 133664697 40027
## - host_is_superhost 1 165439 133749949 40030
## - availability_30 1 168454 133752964 40030
## - reviews_per_month 1 246044 133830555 40032
## - host_identity_verified 1 491348 134075858 40039
## - review_scores_rating 1 664729 134249239 40044
## - accommodates 1 801141 134385651 40048
## - bedrooms 1 870305 134454816 40050
## - bathrooms 1 2408446 135992957 40093
## - neighbourhood_cleansed 22 4255500 137840011 40103
##
## Step: AIC=40024.86
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## review_scores_rating + reviews_per_month + has_availability
##
## Df Sum of Sq RSS AIC
## - maximum_nights 1 1350 133587109 40023
## - has_availability 1 2821 133588581 40023
## - host_listings_count 1 7547 133593307 40023
## - availability_60 1 12835 133598594 40023
## - host_response_rate 1 48995 133634755 40024
## - instant_bookable 1 50699 133636459 40024
## - host_acceptance_rate 1 58179 133643939 40025
## <none> 133585759 40025
## - minimum_nights 1 81354 133667114 40025
## - host_is_superhost 1 164325 133750084 40028
## - availability_30 1 187114 133772874 40028
## - reviews_per_month 1 246986 133832745 40030
## - host_identity_verified 1 491180 134076939 40037
## - review_scores_rating 1 664541 134250300 40042
## - accommodates 1 802810 134388569 40046
## - bedrooms 1 869188 134454947 40048
## - bathrooms 1 2407454 135993214 40091
## - neighbourhood_cleansed 22 4254649 137840408 40101
##
## Step: AIC=40022.9
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## review_scores_rating + reviews_per_month + has_availability
##
## Df Sum of Sq RSS AIC
## - has_availability 1 2752 133589862 40021
## - host_listings_count 1 7540 133594650 40021
## - availability_60 1 12710 133599820 40021
## - host_response_rate 1 48941 133636051 40022
## - instant_bookable 1 51188 133638297 40022
## - host_acceptance_rate 1 58450 133645559 40023
## <none> 133587109 40023
## - minimum_nights 1 84322 133671431 40023
## - host_is_superhost 1 166379 133753488 40026
## - availability_30 1 186339 133773448 40026
## - reviews_per_month 1 249971 133837081 40028
## - host_identity_verified 1 490176 134077285 40035
## - review_scores_rating 1 663193 134250302 40040
## - accommodates 1 803547 134390657 40044
## - bedrooms 1 871136 134458245 40046
## - bathrooms 1 2406207 135993316 40089
## - neighbourhood_cleansed 22 4269346 137856455 40099
##
## Step: AIC=40020.98
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_listings_count 1 7540 133597402 40019
## - availability_60 1 12735 133602597 40019
## - host_response_rate 1 48770 133638632 40020
## - instant_bookable 1 51999 133641861 40020
## - host_acceptance_rate 1 58768 133648630 40021
## <none> 133589862 40021
## - minimum_nights 1 84056 133673917 40021
## - host_is_superhost 1 168174 133758036 40024
## - availability_30 1 186688 133776550 40024
## - reviews_per_month 1 251466 133841328 40026
## - host_identity_verified 1 489679 134079541 40033
## - review_scores_rating 1 661848 134251709 40038
## - accommodates 1 806231 134396092 40042
## - bedrooms 1 868584 134458446 40044
## - bathrooms 1 2412954 136002816 40087
## - neighbourhood_cleansed 22 4268184 137858045 40097
##
## Step: AIC=40019.2
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## minimum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - availability_60 1 11734 133609135 40018
## - host_response_rate 1 47650 133645052 40019
## - instant_bookable 1 48896 133646297 40019
## - host_acceptance_rate 1 59106 133656508 40019
## <none> 133597402 40019
## - minimum_nights 1 85555 133682957 40020
## - host_is_superhost 1 162204 133759606 40022
## - availability_30 1 183626 133781028 40022
## - reviews_per_month 1 259107 133856509 40025
## - host_identity_verified 1 485061 134082463 40031
## - review_scores_rating 1 656489 134253891 40036
## - accommodates 1 802342 134399744 40040
## - bedrooms 1 874892 134472293 40042
## - bathrooms 1 2411066 136008468 40085
## - neighbourhood_cleansed 22 4326652 137924054 40097
##
## Step: AIC=40017.53
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## minimum_nights + instant_bookable + host_identity_verified +
## availability_30 + review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_response_rate 1 46305 133655440 40017
## - instant_bookable 1 50771 133659906 40017
## - host_acceptance_rate 1 54992 133664127 40017
## <none> 133609135 40018
## - minimum_nights 1 84750 133693885 40018
## - host_is_superhost 1 165106 133774242 40020
## - reviews_per_month 1 265876 133875012 40023
## - host_identity_verified 1 489228 134098363 40029
## - availability_30 1 525995 134135130 40031
## - review_scores_rating 1 658725 134267861 40034
## - accommodates 1 799044 134408179 40038
## - bedrooms 1 877465 134486601 40041
## - bathrooms 1 2408466 136017601 40084
## - neighbourhood_cleansed 22 4322438 137931573 40095
##
## Step: AIC=40016.86
## price ~ host_is_superhost + host_acceptance_rate + accommodates +
## bathrooms + bedrooms + neighbourhood_cleansed + minimum_nights +
## instant_bookable + host_identity_verified + availability_30 +
## review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_acceptance_rate 1 30657 133686097 40016
## - instant_bookable 1 50515 133705955 40016
## <none> 133655440 40017
## - minimum_nights 1 83346 133738786 40017
## - host_is_superhost 1 158757 133814197 40019
## - reviews_per_month 1 270254 133925694 40023
## - host_identity_verified 1 495639 134151078 40029
## - availability_30 1 545372 134200812 40030
## - review_scores_rating 1 651864 134307304 40033
## - accommodates 1 798875 134454315 40038
## - bedrooms 1 876113 134531552 40040
## - bathrooms 1 2417322 136072762 40083
## - neighbourhood_cleansed 22 4355970 138011410 40095
##
## Step: AIC=40015.73
## price ~ host_is_superhost + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - instant_bookable 1 37896 133723993 40015
## <none> 133686097 40016
## - minimum_nights 1 73129 133759226 40016
## - host_is_superhost 1 192539 133878636 40019
## - reviews_per_month 1 245126 133931223 40021
## - host_identity_verified 1 500407 134186504 40028
## - availability_30 1 548591 134234688 40029
## - review_scores_rating 1 651690 134337787 40032
## - accommodates 1 828624 134514721 40037
## - bedrooms 1 870637 134556734 40039
## - bathrooms 1 2399211 136085308 40082
## - neighbourhood_cleansed 22 4334411 138020508 40094
##
## Step: AIC=40014.81
## price ~ host_is_superhost + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + minimum_nights + host_identity_verified +
## availability_30 + review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## <none> 133723993 40015
## - minimum_nights 1 77720 133801713 40015
## - host_is_superhost 1 208509 133932502 40019
## - reviews_per_month 1 269104 133993097 40020
## - host_identity_verified 1 495765 134219758 40027
## - availability_30 1 532607 134256600 40028
## - review_scores_rating 1 681435 134405428 40032
## - accommodates 1 813727 134537720 40036
## - bedrooms 1 881805 134605798 40038
## - bathrooms 1 2397946 136121939 40081
## - neighbourhood_cleansed 22 4318983 138042976 40092
# Summary of the model kept by stepwise selection: prints the final call,
# residual quantiles, the coefficient table and the overall fit statistics.
summary(step_model)
##
## Call:
## lm(formula = price ~ host_is_superhost + accommodates + bathrooms +
## bedrooms + neighbourhood_cleansed + minimum_nights + host_identity_verified +
## availability_30 + review_scores_rating + reviews_per_month,
## data = air_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -591.5 -53.0 -12.8 29.5 8411.0
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -112.0849 49.0409 -2.286
## host_is_superhostt 15.7748 6.4926 2.430
## accommodates 14.0895 2.9354 4.800
## bathrooms 62.0460 7.5302 8.240
## bedrooms 36.9237 7.3898 4.997
## neighbourhood_cleansedDowntown 19.7935 26.8794 0.736
## neighbourhood_cleansedDowntown Eastside 4.1203 29.3261 0.140
## neighbourhood_cleansedDunbar Southlands -33.0215 31.8826 -1.036
## neighbourhood_cleansedFairview -4.7947 33.0391 -0.145
## neighbourhood_cleansedGrandview-Woodland -29.9364 30.6737 -0.976
## neighbourhood_cleansedHastings-Sunrise -57.0590 30.4421 -1.874
## neighbourhood_cleansedKensington-Cedar Cottage -38.7957 28.9638 -1.339
## neighbourhood_cleansedKerrisdale -60.4145 37.4025 -1.615
## neighbourhood_cleansedKillarney -80.8849 37.4822 -2.158
## neighbourhood_cleansedKitsilano 23.9052 28.3344 0.844
## neighbourhood_cleansedMarpole -65.3931 32.7020 -2.000
## neighbourhood_cleansedMount Pleasant 31.4575 29.0042 1.085
## neighbourhood_cleansedOakridge -5.1763 35.4228 -0.146
## neighbourhood_cleansedRenfrew-Collingwood -52.4513 29.9687 -1.750
## neighbourhood_cleansedRiley Park -20.5983 29.2187 -0.705
## neighbourhood_cleansedShaughnessy -20.1101 35.3966 -0.568
## neighbourhood_cleansedSouth Cambie -24.6347 37.1773 -0.663
## neighbourhood_cleansedStrathcona -8.0929 46.5123 -0.174
## neighbourhood_cleansedSunset -56.7506 33.5279 -1.693
## neighbourhood_cleansedVictoria-Fraserview -92.5127 34.6655 -2.669
## neighbourhood_cleansedWest End 18.6769 28.6604 0.652
## neighbourhood_cleansedWest Point Grey 70.9888 36.6270 1.938
## minimum_nights 0.2237 0.1508 1.483
## host_identity_verifiedt -64.4253 17.1962 -3.746
## availability_30 1.2442 0.3204 3.883
## review_scores_rating 33.7967 7.6944 4.392
## reviews_per_month -5.4174 1.9627 -2.760
## Pr(>|t|)
## (Intercept) 0.022336 *
## host_is_superhostt 0.015159 *
## accommodates 0.000001649396864246 ***
## bathrooms 0.000000000000000236 ***
## bedrooms 0.000000609972329033 ***
## neighbourhood_cleansedDowntown 0.461544
## neighbourhood_cleansedDowntown Eastside 0.888273
## neighbourhood_cleansedDunbar Southlands 0.300399
## neighbourhood_cleansedFairview 0.884622
## neighbourhood_cleansedGrandview-Woodland 0.329146
## neighbourhood_cleansedHastings-Sunrise 0.060960 .
## neighbourhood_cleansedKensington-Cedar Cottage 0.180503
## neighbourhood_cleansedKerrisdale 0.106339
## neighbourhood_cleansedKillarney 0.030994 *
## neighbourhood_cleansedKitsilano 0.398902
## neighbourhood_cleansedMarpole 0.045608 *
## neighbourhood_cleansedMount Pleasant 0.278176
## neighbourhood_cleansedOakridge 0.883828
## neighbourhood_cleansedRenfrew-Collingwood 0.080164 .
## neighbourhood_cleansedRiley Park 0.480872
## neighbourhood_cleansedShaughnessy 0.569976
## neighbourhood_cleansedSouth Cambie 0.507610
## neighbourhood_cleansedStrathcona 0.861878
## neighbourhood_cleansedSunset 0.090606 .
## neighbourhood_cleansedVictoria-Fraserview 0.007646 **
## neighbourhood_cleansedWest End 0.514659
## neighbourhood_cleansedWest Point Grey 0.052678 .
## minimum_nights 0.138057
## host_identity_verifiedt 0.000182 ***
## availability_30 0.000105 ***
## review_scores_rating 0.000011519513171064 ***
## reviews_per_month 0.005804 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 187.9 on 3786 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.2158
## F-statistic: 34.88 on 31 and 3786 DF, p-value: < 0.00000000000000022
# Exploratory plots on air_clean (titled "Model 2").
# Columns are named directly inside aes()/facet_wrap() so ggplot2 looks them
# up in the `data` argument; `air_clean$col` bypasses that lookup and is
# discouraged by ggplot2.

# Price against accommodates, one panel per bedroom count.
p <- ggplot(air_clean, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bedrooms)
print(p + labs(
  title = "Price compared to accommodation size split by bedrooms (Model 2)",
  y = "Price",
  x = "Accommodates"
))

# Price against accommodates, one panel per bathroom count, with a boxplot
# per accommodates value drawn over the points.
p <- ggplot(air_clean, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bathrooms) +
  geom_boxplot(aes(group = accommodates))
print(p + labs(
  title = "Price compared to accommodation size split by bathrooms (Model 2)",
  y = "Price",
  x = "Accommodates"
))

# Price distribution for each bathroom count.
p <- ggplot(air_clean, aes(x = bathrooms, y = price)) +
  geom_boxplot(aes(group = bathrooms))
print(p + labs(
  title = "Price compared to bathrooms (Model 2)",
  y = "Price",
  x = "Bathrooms"
))

# Price against the number of reviews.
p <- ggplot(air_clean, aes(x = number_of_reviews, y = price)) +
  geom_point()
print(p + labs(
  title = "Price compared to the number of reviews (Model 2)",
  y = "Price",
  x = "Number of reviews"
))
In this chunk of code I got rid of the non-significant neighbourhoods.
# Neighbourhoods retained for the reduced-data model
significant_neighborhoods <- c(
  "Downtown", "Downtown Eastside", "Kitsilano", "Hastings-Sunrise",
  "Kerrisdale", "Killarney", "Marpole", "Mount Pleasant", "Oakridge",
  "Renfrew-Collingwood", "Sunset", "Victoria-Fraserview", "West End"
)

# Keep only the listings located in one of those neighbourhoods
air_clean_sig <- dplyr::filter(
  air_clean,
  neighbourhood_cleansed %in% significant_neighborhoods
)

# Full candidate model, fitted on the neighbourhood-restricted data
lm_model_significant_neighborhoods <- lm(
  price ~ host_is_superhost +
    host_response_time +
    host_response_rate +
    host_acceptance_rate +
    accommodates +
    bathrooms +
    bedrooms +
    neighbourhood_cleansed +
    host_listings_count +
    minimum_nights +
    maximum_nights +
    instant_bookable +
    host_identity_verified +
    availability_30 +
    availability_60 +
    availability_90 +
    review_scores_rating +
    reviews_per_month +
    has_availability,
  data = air_clean_sig
)

# AIC-based stepwise selection starting from the full candidate model
step_model2 <- step(lm_model_significant_neighborhoods)
## Start: AIC=29951.86
## price ~ host_is_superhost + host_response_time + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month + has_availability
##
## Df Sum of Sq RSS AIC
## - host_response_time 3 101525 119806674 29948
## - availability_60 1 1541 119706690 29950
## - maximum_nights 1 1962 119707111 29950
## - availability_90 1 2296 119707445 29950
## - has_availability 1 7962 119713111 29950
## - host_response_rate 1 9414 119714563 29950
## - host_listings_count 1 21399 119726548 29950
## - reviews_per_month 1 35951 119741100 29951
## - host_acceptance_rate 1 75517 119780666 29952
## <none> 119705149 29952
## - instant_bookable 1 91047 119796196 29952
## - availability_30 1 105595 119810744 29952
## - host_is_superhost 1 164460 119869609 29954
## - host_identity_verified 1 339175 120044324 29958
## - minimum_nights 1 374516 120079665 29959
## - bedrooms 1 452026 120157174 29960
## - review_scores_rating 1 546572 120251721 29963
## - accommodates 1 596566 120301715 29964
## - bathrooms 1 1888585 121593733 29994
## - neighbourhood_cleansed 12 3208111 122913260 30002
##
## Step: AIC=29948.23
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## availability_90 + review_scores_rating + reviews_per_month +
## has_availability
##
## Df Sum of Sq RSS AIC
## - maximum_nights 1 873 119807546 29946
## - availability_60 1 1430 119808103 29946
## - availability_90 1 2060 119808734 29946
## - has_availability 1 7384 119814057 29946
## - host_listings_count 1 28981 119835655 29947
## - reviews_per_month 1 43048 119849721 29947
## - host_response_rate 1 55112 119861786 29948
## - host_acceptance_rate 1 72992 119879665 29948
## <none> 119806674 29948
## - instant_bookable 1 101090 119907764 29949
## - availability_30 1 102165 119908839 29949
## - host_is_superhost 1 164373 119971047 29950
## - host_identity_verified 1 327028 120133702 29954
## - minimum_nights 1 380920 120187593 29955
## - bedrooms 1 454675 120261349 29957
## - review_scores_rating 1 570650 120377323 29960
## - accommodates 1 574102 120380776 29960
## - bathrooms 1 1893253 121699927 29990
## - neighbourhood_cleansed 12 3195984 123002658 29998
##
## Step: AIC=29946.25
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## availability_90 + review_scores_rating + reviews_per_month +
## has_availability
##
## Df Sum of Sq RSS AIC
## - availability_60 1 1528 119809074 29944
## - availability_90 1 2192 119809738 29944
## - has_availability 1 7354 119814900 29944
## - host_listings_count 1 29050 119836596 29945
## - reviews_per_month 1 44116 119851662 29945
## - host_response_rate 1 55100 119862647 29946
## - host_acceptance_rate 1 73009 119880555 29946
## <none> 119807546 29946
## - instant_bookable 1 102209 119909755 29947
## - availability_30 1 102233 119909779 29947
## - host_is_superhost 1 166656 119974202 29948
## - host_identity_verified 1 326521 120134067 29952
## - minimum_nights 1 386681 120194228 29953
## - bedrooms 1 457110 120264657 29955
## - review_scores_rating 1 569840 120377387 29958
## - accommodates 1 573793 120381339 29958
## - bathrooms 1 1892509 121700056 29988
## - neighbourhood_cleansed 12 3201343 123008890 29996
##
## Step: AIC=29944.29
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_90 +
## review_scores_rating + reviews_per_month + has_availability
##
## Df Sum of Sq RSS AIC
## - availability_90 1 684 119809758 29942
## - has_availability 1 7609 119816683 29943
## - host_listings_count 1 29490 119838564 29943
## - reviews_per_month 1 44852 119853926 29943
## - host_response_rate 1 55376 119864450 29944
## - host_acceptance_rate 1 72955 119882029 29944
## <none> 119809074 29944
## - instant_bookable 1 102431 119911505 29945
## - host_is_superhost 1 165467 119974540 29946
## - availability_30 1 184374 119993448 29947
## - host_identity_verified 1 326405 120135478 29950
## - minimum_nights 1 388938 120198012 29951
## - bedrooms 1 456518 120265592 29953
## - review_scores_rating 1 570802 120379876 29956
## - accommodates 1 573783 120382857 29956
## - bathrooms 1 1891705 121700779 29986
## - neighbourhood_cleansed 12 3200402 123009476 29994
##
## Step: AIC=29942.31
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + review_scores_rating +
## reviews_per_month + has_availability
##
## Df Sum of Sq RSS AIC
## - has_availability 1 7639 119817397 29941
## - host_listings_count 1 30268 119840026 29941
## - reviews_per_month 1 44519 119854277 29941
## - host_response_rate 1 56048 119865806 29942
## - host_acceptance_rate 1 74802 119884560 29942
## <none> 119809758 29942
## - instant_bookable 1 101962 119911720 29943
## - host_is_superhost 1 164806 119974564 29944
## - host_identity_verified 1 326146 120135904 29948
## - minimum_nights 1 389738 120199496 29949
## - bedrooms 1 456234 120265992 29951
## - availability_30 1 502692 120312450 29952
## - review_scores_rating 1 570724 120380482 29954
## - accommodates 1 575506 120385263 29954
## - bathrooms 1 1891289 121701047 29984
## - neighbourhood_cleansed 12 3200310 123010068 29992
##
## Step: AIC=29940.48
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_listings_count 1 30175 119847571 29939
## - reviews_per_month 1 45622 119863018 29940
## - host_response_rate 1 55535 119872932 29940
## - host_acceptance_rate 1 75002 119892398 29940
## <none> 119817397 29941
## - instant_bookable 1 103716 119921112 29941
## - host_is_superhost 1 167346 119984742 29942
## - host_identity_verified 1 325405 120142801 29946
## - minimum_nights 1 388735 120206131 29948
## - bedrooms 1 453820 120271217 29949
## - availability_30 1 505903 120323300 29950
## - review_scores_rating 1 567431 120384828 29952
## - accommodates 1 579283 120396680 29952
## - bathrooms 1 1890728 121708124 29982
## - neighbourhood_cleansed 12 3201610 123019007 29990
##
## Step: AIC=29939.19
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## minimum_nights + instant_bookable + host_identity_verified +
## availability_30 + review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## - reviews_per_month 1 51904 119899476 29938
## - host_response_rate 1 53084 119900655 29938
## - host_acceptance_rate 1 75862 119923434 29939
## <none> 119847571 29939
## - instant_bookable 1 95542 119943113 29939
## - host_is_superhost 1 152961 120000532 29941
## - host_identity_verified 1 316752 120164323 29945
## - minimum_nights 1 393691 120241262 29946
## - bedrooms 1 463063 120310634 29948
## - availability_30 1 504716 120352287 29949
## - review_scores_rating 1 556903 120404474 29950
## - accommodates 1 572343 120419914 29951
## - bathrooms 1 1882700 121730271 29981
## - neighbourhood_cleansed 12 3261100 123108671 29990
##
## Step: AIC=29938.4
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## minimum_nights + instant_bookable + host_identity_verified +
## availability_30 + review_scores_rating
##
## Df Sum of Sq RSS AIC
## - host_response_rate 1 54963 119954439 29938
## - host_acceptance_rate 1 58087 119957563 29938
## <none> 119899476 29938
## - instant_bookable 1 105394 120004870 29939
## - host_is_superhost 1 134378 120033853 29940
## - host_identity_verified 1 317719 120217195 29944
## - bedrooms 1 473114 120372589 29947
## - availability_30 1 514875 120414350 29948
## - accommodates 1 537790 120437266 29949
## - review_scores_rating 1 541879 120441355 29949
## - minimum_nights 1 551125 120450601 29949
## - bathrooms 1 1975566 121875042 29982
## - neighbourhood_cleansed 12 3231552 123131028 29989
##
## Step: AIC=29937.69
## price ~ host_is_superhost + host_acceptance_rate + accommodates +
## bathrooms + bedrooms + neighbourhood_cleansed + minimum_nights +
## instant_bookable + host_identity_verified + availability_30 +
## review_scores_rating
##
## Df Sum of Sq RSS AIC
## - host_acceptance_rate 1 28616 119983055 29936
## <none> 119954439 29938
## - instant_bookable 1 105085 120059524 29938
## - host_is_superhost 1 125459 120079898 29939
## - host_identity_verified 1 322331 120276770 29943
## - bedrooms 1 466805 120421244 29947
## - review_scores_rating 1 533675 120488114 29948
## - accommodates 1 538126 120492565 29948
## - availability_30 1 542662 120497101 29948
## - minimum_nights 1 543823 120498262 29948
## - bathrooms 1 1983143 121937582 29982
## - neighbourhood_cleansed 12 3246192 123200631 29989
##
## Step: AIC=29936.36
## price ~ host_is_superhost + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + review_scores_rating
##
## Df Sum of Sq RSS AIC
## <none> 119983055 29936
## - instant_bookable 1 86666 120069721 29936
## - host_is_superhost 1 158117 120141172 29938
## - host_identity_verified 1 326593 120309648 29942
## - bedrooms 1 459689 120442744 29945
## - minimum_nights 1 515214 120498269 29946
## - availability_30 1 535040 120518095 29947
## - review_scores_rating 1 535662 120518717 29947
## - accommodates 1 572401 120555456 29948
## - bathrooms 1 1959822 121942877 29980
## - neighbourhood_cleansed 12 3221564 123204619 29987
# Summary of the stepwise-selected model fitted on air_clean_sig: prints the
# final call, residual quantiles, coefficient table and fit statistics.
summary(step_model2)
##
## Call:
## lm(formula = price ~ host_is_superhost + accommodates + bathrooms +
## bedrooms + neighbourhood_cleansed + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + review_scores_rating,
## data = air_clean_sig)
##
## Residuals:
## Min 1Q Median 3Q Max
## -589.8 -55.4 -13.4 29.0 8408.0
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -102.0570 51.7084 -1.974
## host_is_superhostt 15.8017 8.2542 1.914
## accommodates 13.4856 3.7024 3.642
## bathrooms 67.6098 10.0314 6.740
## bedrooms 30.9713 9.4883 3.264
## neighbourhood_cleansedDowntown Eastside -17.4988 15.8407 -1.105
## neighbourhood_cleansedHastings-Sunrise -73.2112 18.5153 -3.954
## neighbourhood_cleansedKerrisdale -74.8435 30.5068 -2.453
## neighbourhood_cleansedKillarney -92.2314 30.6138 -3.013
## neighbourhood_cleansedKitsilano 9.9534 13.8034 0.721
## neighbourhood_cleansedMarpole -81.2461 22.6167 -3.592
## neighbourhood_cleansedMount Pleasant 15.9457 15.1700 1.051
## neighbourhood_cleansedOakridge -14.6127 27.6033 -0.529
## neighbourhood_cleansedRenfrew-Collingwood -66.9938 17.5390 -3.820
## neighbourhood_cleansedSunset -70.4233 24.1900 -2.911
## neighbourhood_cleansedVictoria-Fraserview -107.5467 26.3020 -4.089
## neighbourhood_cleansedWest End -0.2730 14.1337 -0.019
## minimum_nights 0.5869 0.1698 3.456
## instant_bookablet -13.1850 9.3028 -1.417
## host_identity_verifiedt -64.4407 23.4216 -2.751
## availability_30 1.4743 0.4186 3.522
## review_scores_rating 33.1591 9.4106 3.524
## Pr(>|t|)
## (Intercept) 0.048514 *
## host_is_superhostt 0.055673 .
## accommodates 0.000275 ***
## bathrooms 0.0000000000192 ***
## bedrooms 0.001111 **
## neighbourhood_cleansedDowntown Eastside 0.269396
## neighbourhood_cleansedHastings-Sunrise 0.0000787425421 ***
## neighbourhood_cleansedKerrisdale 0.014215 *
## neighbourhood_cleansedKillarney 0.002612 **
## neighbourhood_cleansedKitsilano 0.470921
## neighbourhood_cleansedMarpole 0.000333 ***
## neighbourhood_cleansedMount Pleasant 0.293288
## neighbourhood_cleansedOakridge 0.596583
## neighbourhood_cleansedRenfrew-Collingwood 0.000137 ***
## neighbourhood_cleansedSunset 0.003628 **
## neighbourhood_cleansedVictoria-Fraserview 0.0000445670787 ***
## neighbourhood_cleansedWest End 0.984590
## minimum_nights 0.000557 ***
## instant_bookablet 0.156503
## host_identity_verifiedt 0.005974 **
## availability_30 0.000436 ***
## review_scores_rating 0.000433 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 207.7 on 2781 degrees of freedom
## Multiple R-squared: 0.1718, Adjusted R-squared: 0.1655
## F-statistic: 27.47 on 21 and 2781 DF, p-value: < 0.00000000000000022
# Exploratory plots on air_clean_sig (listings in the retained neighbourhoods).
# Columns are named directly inside aes()/facet_wrap() so ggplot2 looks them
# up in the `data` argument; `air_clean_sig$col` bypasses that lookup and is
# discouraged by ggplot2.

# Price against accommodates, one panel per bedroom count.
p <- ggplot(air_clean_sig, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bedrooms)
print(p + labs(
  title = "Price compared to accommodation size split by bedrooms",
  y = "Price",
  x = "Accommodates"
))

# Price against accommodates, one panel per bathroom count, with a boxplot
# per accommodates value drawn over the points.
p <- ggplot(air_clean_sig, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bathrooms) +
  geom_boxplot(aes(group = accommodates))
print(p + labs(
  title = "Price compared to accommodation size split by bathrooms",
  y = "Price",
  x = "Accommodates"
))

# Price distribution for each bathroom count.
p <- ggplot(air_clean_sig, aes(x = bathrooms, y = price)) +
  geom_boxplot(aes(group = bathrooms))
print(p + labs(
  title = "Price compared to bathrooms",
  y = "Price",
  x = "Bathrooms"
))

# Price against the number of reviews.
p <- ggplot(air_clean_sig, aes(x = number_of_reviews, y = price)) +
  geom_point()
print(p + labs(
  title = "Price compared to the number of reviews",
  y = "Price",
  x = "Number of reviews"
))
In this code I filtered the data to remove extreme price outliers.
# Remove rows whose value in one column falls outside IQR-based fences.
#
# The fences are  Q1 - lower_mult * IQR  and  Q3 + upper_mult * IQR.
# The defaults (1.5 below, 4.5 above) are asymmetric, so on the high side
# only values far above the upper quartile are dropped.
#
# Arguments:
#   data        data frame to filter.
#   variable    name (single string) of a numeric column in `data`.
#   lower_mult  IQR multiple used for the lower fence (default 1.5).
#   upper_mult  IQR multiple used for the upper fence (default 4.5).
#
# Returns:
#   `data` restricted to rows whose value lies within the fences
#   (inclusive). Rows where the value is NA are kept, since a missing
#   value is not an outlier. The result is always a data frame, even when
#   `data` has a single column.
remove_outliers <- function(data, variable, lower_mult = 1.5, upper_mult = 4.5) {
  stopifnot(
    is.data.frame(data),
    is.character(variable),
    length(variable) == 1L,
    variable %in% names(data)
  )

  values <- data[[variable]]

  # na.rm = TRUE: quantile() raises an error on NA input otherwise.
  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - lower_mult * iqr
  upper_bound <- quartiles[2] + upper_mult * iqr

  # Build an NA-free logical index; an NA in a row index would otherwise
  # produce a row filled with NA in the result.
  keep <- is.na(values) | (values >= lower_bound & values <= upper_bound)

  # drop = FALSE keeps a one-column data frame from collapsing to a vector.
  data[keep, , drop = FALSE]
}
# Remove outliers from the 'price' variable in air_clean
air_clean_filtered <- remove_outliers(air_clean, "price")

# Exploratory plots on the outlier-filtered data (titled "Model 3").
# Columns are named directly inside aes()/facet_wrap() so ggplot2 looks them
# up in the `data` argument; `air_clean_filtered$col` bypasses that lookup and
# is discouraged by ggplot2.

# Price against accommodates, one panel per bedroom count.
p <- ggplot(air_clean_filtered, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bedrooms)
print(p + labs(
  title = "Price compared to accommodation size split by bedrooms (Model 3)",
  y = "Price",
  x = "Accommodates"
))

# Price against accommodates, one panel per bathroom count.
p <- ggplot(air_clean_filtered, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bathrooms)
# Boxplot overlay left disabled for this plot:
# + geom_boxplot(aes(group = accommodates))
print(p + labs(
  title = "Price compared to accommodation size split by bathrooms (Model 3)",
  y = "Price",
  x = "Accommodates"
))

# Price distribution for each bathroom count, with a smoothed trend line.
p <- ggplot(air_clean_filtered, aes(x = bathrooms, y = price)) +
  geom_boxplot(aes(group = bathrooms)) +
  geom_smooth()
print(p + labs(
  title = "Price compared to bathrooms (Model 3)",
  y = "Price",
  x = "Bathrooms"
))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Price against the number of reviews on the outlier-filtered data.
# Columns are named directly in aes() so ggplot2 looks them up in `data`
# rather than through `air_clean_filtered$...`, which ggplot2 discourages.
p <- ggplot(air_clean_filtered, aes(x = number_of_reviews, y = price)) +
  geom_point()
print(p + labs(
  title = "Price compared to the number of reviews (Model 3)",
  y = "Price",
  x = "Number of reviews"
))
This model has the data without outliers and with all neighbourhoods.
# Full candidate model for price, fitted on the outlier-filtered listings
updated_model <- lm(
  price ~ host_is_superhost +
    host_response_time +
    host_response_rate +
    host_acceptance_rate +
    accommodates +
    bathrooms +
    bedrooms +
    neighbourhood_cleansed +
    host_listings_count +
    minimum_nights +
    maximum_nights +
    instant_bookable +
    host_identity_verified +
    availability_30 +
    availability_60 +
    availability_90 +
    review_scores_rating +
    reviews_per_month,
  data = air_clean_filtered
)

# Coefficient table and fit statistics for the full candidate model
summary(updated_model)
##
## Call:
## lm(formula = price ~ host_is_superhost + host_response_time +
## host_response_rate + host_acceptance_rate + accommodates +
## bathrooms + bedrooms + neighbourhood_cleansed + host_listings_count +
## minimum_nights + maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month, data = air_clean_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.17 -41.58 -10.67 26.03 531.67
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -57.983961 22.174989 -2.615
## host_is_superhostt 13.947462 2.499798 5.579
## host_response_timewithin a day -18.781762 19.987237 -0.940
## host_response_timewithin a few hours -17.400411 21.185949 -0.821
## host_response_timewithin an hour -16.127042 21.405478 -0.753
## host_response_rate -23.410321 21.871553 -1.070
## host_acceptance_rate 14.932259 8.179617 1.826
## accommodates 10.940097 1.109111 9.864
## bathrooms 31.063010 2.892050 10.741
## bedrooms 37.056934 2.801245 13.229
## neighbourhood_cleansedDowntown 40.631438 10.220205 3.976
## neighbourhood_cleansedDowntown Eastside 26.259151 11.105902 2.364
## neighbourhood_cleansedDunbar Southlands -12.188136 12.079041 -1.009
## neighbourhood_cleansedFairview 24.229962 12.461023 1.944
## neighbourhood_cleansedGrandview-Woodland -3.173455 11.606267 -0.273
## neighbourhood_cleansedHastings-Sunrise -31.746715 11.503076 -2.760
## neighbourhood_cleansedKensington-Cedar Cottage -18.842178 11.002814 -1.712
## neighbourhood_cleansedKerrisdale -48.170640 14.177506 -3.398
## neighbourhood_cleansedKillarney -49.508179 14.074920 -3.517
## neighbourhood_cleansedKitsilano 42.100907 10.765304 3.911
## neighbourhood_cleansedMarpole -36.050017 12.319479 -2.926
## neighbourhood_cleansedMount Pleasant 18.524095 10.988627 1.686
## neighbourhood_cleansedOakridge -32.931088 13.556107 -2.429
## neighbourhood_cleansedRenfrew-Collingwood -31.604184 11.356458 -2.783
## neighbourhood_cleansedRiley Park -3.728424 11.090988 -0.336
## neighbourhood_cleansedShaughnessy -14.756341 13.514720 -1.092
## neighbourhood_cleansedSouth Cambie -2.487114 13.957394 -0.178
## neighbourhood_cleansedStrathcona 19.861608 17.378926 1.143
## neighbourhood_cleansedSunset -32.298030 12.669553 -2.549
## neighbourhood_cleansedVictoria-Fraserview -59.136488 13.048136 -4.532
## neighbourhood_cleansedWest End 38.452235 10.887924 3.532
## neighbourhood_cleansedWest Point Grey 17.801929 14.160330 1.257
## host_listings_count 0.117750 0.037089 3.175
## minimum_nights -0.623033 0.059016 -10.557
## maximum_nights -0.002371 0.002463 -0.963
## instant_bookablet -5.350815 2.800213 -1.911
## host_identity_verifiedt -3.991732 6.492820 -0.615
## availability_30 1.159694 0.285848 4.057
## availability_60 -0.474867 0.280380 -1.694
## availability_90 0.211564 0.140628 1.504
## review_scores_rating 24.125235 2.879938 8.377
## reviews_per_month -5.242094 0.762804 -6.872
## Pr(>|t|)
## (Intercept) 0.008963 **
## host_is_superhostt 0.00000002584819 ***
## host_response_timewithin a day 0.347439
## host_response_timewithin a few hours 0.411517
## host_response_timewithin an hour 0.451253
## host_response_rate 0.284529
## host_acceptance_rate 0.067999 .
## accommodates < 0.0000000000000002 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms < 0.0000000000000002 ***
## neighbourhood_cleansedDowntown 0.00007153399552 ***
## neighbourhood_cleansedDowntown Eastside 0.018109 *
## neighbourhood_cleansedDunbar Southlands 0.313025
## neighbourhood_cleansedFairview 0.051915 .
## neighbourhood_cleansedGrandview-Woodland 0.784541
## neighbourhood_cleansedHastings-Sunrise 0.005811 **
## neighbourhood_cleansedKensington-Cedar Cottage 0.086890 .
## neighbourhood_cleansedKerrisdale 0.000687 ***
## neighbourhood_cleansedKillarney 0.000441 ***
## neighbourhood_cleansedKitsilano 0.00009363165656 ***
## neighbourhood_cleansedMarpole 0.003451 **
## neighbourhood_cleansedMount Pleasant 0.091927 .
## neighbourhood_cleansedOakridge 0.015177 *
## neighbourhood_cleansedRenfrew-Collingwood 0.005414 **
## neighbourhood_cleansedRiley Park 0.736764
## neighbourhood_cleansedShaughnessy 0.274960
## neighbourhood_cleansedSouth Cambie 0.858581
## neighbourhood_cleansedStrathcona 0.253172
## neighbourhood_cleansedSunset 0.010835 *
## neighbourhood_cleansedVictoria-Fraserview 0.00000602112568 ***
## neighbourhood_cleansedWest End 0.000418 ***
## neighbourhood_cleansedWest Point Grey 0.208771
## host_listings_count 0.001512 **
## minimum_nights < 0.0000000000000002 ***
## maximum_nights 0.335804
## instant_bookablet 0.056099 .
## host_identity_verifiedt 0.538730
## availability_30 0.00005071876326 ***
## availability_60 0.090414 .
## availability_90 0.132558
## review_scores_rating < 0.0000000000000002 ***
## reviews_per_month 0.00000000000738 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.47 on 3725 degrees of freedom
## Multiple R-squared: 0.5254, Adjusted R-squared: 0.5202
## F-statistic: 100.6 on 41 and 3725 DF, p-value: < 0.00000000000000022
# Perform stepwise selection on the updated model. step() is called with its
# defaults and prints an AIC table for every candidate term removal; the
# selected model is stored in step_model3.
step_model3 <- step(updated_model)
## Start: AIC=31992.58
## price ~ host_is_superhost + host_response_time + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_response_time 3 6420 17983157 31988
## - host_identity_verified 1 1824 17978561 31991
## - maximum_nights 1 4472 17981209 31992
## - host_response_rate 1 5529 17982266 31992
## <none> 17976737 31993
## - availability_90 1 10923 17987659 31993
## - availability_60 1 13843 17990580 31993
## - host_acceptance_rate 1 16083 17992820 31994
## - instant_bookable 1 17621 17994358 31994
## - host_listings_count 1 48643 18025379 32001
## - availability_30 1 79433 18056170 32007
## - host_is_superhost 1 150233 18126970 32022
## - reviews_per_month 1 227912 18204649 32038
## - review_scores_rating 1 338658 18315395 32061
## - accommodates 1 469544 18446281 32088
## - minimum_nights 1 537856 18514593 32102
## - bathrooms 1 556750 18533486 32105
## - bedrooms 1 844542 18821279 32164
## - neighbourhood_cleansed 22 3320006 21296743 32587
##
## Step: AIC=31987.93
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## availability_90 + review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_identity_verified 1 1943 17985100 31986
## - maximum_nights 1 4564 17987721 31987
## <none> 17983157 31988
## - availability_90 1 11211 17994368 31988
## - availability_60 1 13867 17997024 31989
## - instant_bookable 1 16118 17999275 31989
## - host_acceptance_rate 1 20384 18003541 31990
## - host_response_rate 1 42187 18025344 31995
## - host_listings_count 1 49403 18032560 31996
## - availability_30 1 77876 18061033 32002
## - host_is_superhost 1 154372 18137529 32018
## - reviews_per_month 1 227036 18210193 32033
## - review_scores_rating 1 343863 18327020 32057
## - accommodates 1 472461 18455618 32084
## - minimum_nights 1 545617 18528774 32099
## - bathrooms 1 555655 18538812 32101
## - bedrooms 1 844522 18827679 32159
## - neighbourhood_cleansed 22 3315671 21298828 32581
##
## Step: AIC=31986.33
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - maximum_nights 1 4722 17989822 31985
## <none> 17985100 31986
## - availability_90 1 11174 17996274 31987
## - availability_60 1 13967 17999067 31987
## - instant_bookable 1 15927 18001028 31988
## - host_acceptance_rate 1 20649 18005749 31989
## - host_response_rate 1 42567 18027668 31993
## - host_listings_count 1 48613 18033713 31995
## - availability_30 1 78304 18063404 32001
## - host_is_superhost 1 152620 18137720 32016
## - reviews_per_month 1 227711 18212811 32032
## - review_scores_rating 1 344806 18329906 32056
## - accommodates 1 472261 18457361 32082
## - minimum_nights 1 545159 18530259 32097
## - bathrooms 1 556147 18541247 32099
## - bedrooms 1 844428 18829528 32157
## - neighbourhood_cleansed 22 3313948 21299048 32579
##
## Step: AIC=31985.32
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## <none> 17989822 31985
## - availability_90 1 10742 18000564 31986
## - availability_60 1 13664 18003486 31986
## - instant_bookable 1 15478 18005300 31987
## - host_acceptance_rate 1 20382 18010204 31988
## - host_response_rate 1 42659 18032482 31992
## - host_listings_count 1 48652 18038474 31993
## - availability_30 1 78937 18068759 32000
## - host_is_superhost 1 150090 18139912 32015
## - reviews_per_month 1 224736 18214558 32030
## - review_scores_rating 1 349248 18339070 32056
## - accommodates 1 471305 18461127 32081
## - bathrooms 1 557933 18547756 32098
## - minimum_nights 1 560158 18549980 32099
## - bedrooms 1 842009 18831832 32156
## - neighbourhood_cleansed 22 3310635 21300457 32578
# Coefficient table, residual summary and fit statistics for step_model3,
# the model kept by the AIC stepwise search traced above.
summary(step_model3)
##
## Call:
## lm(formula = price ~ host_is_superhost + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## instant_bookable + availability_30 + availability_60 + availability_90 +
## review_scores_rating + reviews_per_month, data = air_clean_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -409.80 -41.29 -10.55 25.80 530.25
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -69.21271 20.63659 -3.354
## host_is_superhostt 13.85318 2.48332 5.578
## host_response_rate -36.38087 12.23279 -2.974
## host_acceptance_rate 16.20087 7.88086 2.056
## accommodates 10.94905 1.10760 9.885
## bathrooms 31.08663 2.89029 10.756
## bedrooms 36.98650 2.79926 13.213
## neighbourhood_cleansedDowntown 40.47065 10.21452 3.962
## neighbourhood_cleansedDowntown Eastside 26.15010 11.10106 2.356
## neighbourhood_cleansedDunbar Southlands -11.68747 12.06476 -0.969
## neighbourhood_cleansedFairview 23.79987 12.44651 1.912
## neighbourhood_cleansedGrandview-Woodland -2.72451 11.59912 -0.235
## neighbourhood_cleansedHastings-Sunrise -31.74231 11.49686 -2.761
## neighbourhood_cleansedKensington-Cedar Cottage -18.53389 10.99567 -1.686
## neighbourhood_cleansedKerrisdale -48.28388 14.17200 -3.407
## neighbourhood_cleansedKillarney -49.23182 14.06697 -3.500
## neighbourhood_cleansedKitsilano 42.07129 10.76013 3.910
## neighbourhood_cleansedMarpole -36.48172 12.31180 -2.963
## neighbourhood_cleansedMount Pleasant 18.55323 10.98336 1.689
## neighbourhood_cleansedOakridge -33.29184 13.52923 -2.461
## neighbourhood_cleansedRenfrew-Collingwood -31.23351 11.34767 -2.752
## neighbourhood_cleansedRiley Park -3.55074 11.08524 -0.320
## neighbourhood_cleansedShaughnessy -13.96721 13.47064 -1.037
## neighbourhood_cleansedSouth Cambie -2.49874 13.94926 -0.179
## neighbourhood_cleansedStrathcona 19.79825 17.36209 1.140
## neighbourhood_cleansedSunset -32.03609 12.66234 -2.530
## neighbourhood_cleansedVictoria-Fraserview -58.74530 13.04185 -4.504
## neighbourhood_cleansedWest End 38.20544 10.88267 3.511
## neighbourhood_cleansedWest Point Grey 18.03039 14.15079 1.274
## host_listings_count 0.11741 0.03697 3.176
## minimum_nights -0.63169 0.05862 -10.777
## instant_bookablet -4.97598 2.77767 -1.791
## availability_30 1.15493 0.28548 4.046
## availability_60 -0.47166 0.28022 -1.683
## availability_90 0.20954 0.14040 1.492
## review_scores_rating 24.36282 2.86299 8.510
## reviews_per_month -5.13073 0.75163 -6.826
## Pr(>|t|)
## (Intercept) 0.000805 ***
## host_is_superhostt 0.0000000259865 ***
## host_response_rate 0.002958 **
## host_acceptance_rate 0.039879 *
## accommodates < 0.0000000000000002 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms < 0.0000000000000002 ***
## neighbourhood_cleansedDowntown 0.0000756915373 ***
## neighbourhood_cleansedDowntown Eastside 0.018542 *
## neighbourhood_cleansedDunbar Southlands 0.332744
## neighbourhood_cleansedFairview 0.055931 .
## neighbourhood_cleansedGrandview-Woodland 0.814308
## neighbourhood_cleansedHastings-Sunrise 0.005791 **
## neighbourhood_cleansedKensington-Cedar Cottage 0.091964 .
## neighbourhood_cleansedKerrisdale 0.000664 ***
## neighbourhood_cleansedKillarney 0.000471 ***
## neighbourhood_cleansedKitsilano 0.0000939666432 ***
## neighbourhood_cleansedMarpole 0.003064 **
## neighbourhood_cleansedMount Pleasant 0.091262 .
## neighbourhood_cleansedOakridge 0.013910 *
## neighbourhood_cleansedRenfrew-Collingwood 0.005944 **
## neighbourhood_cleansedRiley Park 0.748749
## neighbourhood_cleansedShaughnessy 0.299867
## neighbourhood_cleansedSouth Cambie 0.857845
## neighbourhood_cleansedStrathcona 0.254228
## neighbourhood_cleansedSunset 0.011446 *
## neighbourhood_cleansedVictoria-Fraserview 0.0000068608695 ***
## neighbourhood_cleansedWest End 0.000452 ***
## neighbourhood_cleansedWest Point Grey 0.202685
## host_listings_count 0.001505 **
## minimum_nights < 0.0000000000000002 ***
## instant_bookablet 0.073307 .
## availability_30 0.0000532489686 ***
## availability_60 0.092430 .
## availability_90 0.135684
## review_scores_rating < 0.0000000000000002 ***
## reviews_per_month 0.0000000000101 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.45 on 3730 degrees of freedom
## Multiple R-squared: 0.5251, Adjusted R-squared: 0.5205
## F-statistic: 114.6 on 36 and 3730 DF, p-value: < 0.00000000000000022
This model is fitted to the data with the outliers removed, restricted to the significant neighbourhoods only.
# Keep only the rows of the outlier-filtered data whose neighbourhood is listed
# in `significant_neighborhoods`.
air_clean_sig2 <- dplyr::filter(
  air_clean_filtered,
  neighbourhood_cleansed %in% significant_neighborhoods
)
# Refit the full OLS specification on the neighbourhood-restricted subset.
# The predictors are listed in their original order so the coefficient table
# keeps the same row sequence.
lm_model_significant_neighborhoods2 <- lm(
  formula = price ~
    host_is_superhost + host_response_time + host_response_rate +
    host_acceptance_rate + accommodates + bathrooms + bedrooms +
    neighbourhood_cleansed + host_listings_count + minimum_nights +
    maximum_nights + instant_bookable + host_identity_verified +
    availability_30 + availability_60 + availability_90 +
    review_scores_rating + reviews_per_month,
  data = air_clean_sig2
)

# Print the coefficient table, residual summary and R-squared of the refit
summary(lm_model_significant_neighborhoods2)
##
## Call:
## lm(formula = price ~ host_is_superhost + host_response_time +
## host_response_rate + host_acceptance_rate + accommodates +
## bathrooms + bedrooms + neighbourhood_cleansed + host_listings_count +
## minimum_nights + maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month, data = air_clean_sig2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -388.50 -41.57 -10.17 25.36 529.06
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -33.494926 22.484505 -1.490
## host_is_superhostt 16.486253 2.898294 5.688
## host_response_timewithin a day -6.955686 22.554585 -0.308
## host_response_timewithin a few hours -12.246510 23.949606 -0.511
## host_response_timewithin an hour -6.802423 24.230589 -0.281
## host_response_rate -22.265309 24.067020 -0.925
## host_acceptance_rate 15.367224 9.896277 1.553
## accommodates 10.368207 1.268506 8.174
## bathrooms 32.493244 3.477076 9.345
## bedrooms 31.953386 3.253569 9.821
## neighbourhood_cleansedDowntown Eastside -14.416014 5.324429 -2.708
## neighbourhood_cleansedHastings-Sunrise -70.085293 6.225902 -11.257
## neighbourhood_cleansedKerrisdale -84.529912 10.403238 -8.125
## neighbourhood_cleansedKillarney -86.299194 10.261253 -8.410
## neighbourhood_cleansedKitsilano 4.753028 4.725043 1.006
## neighbourhood_cleansedMarpole -75.113139 7.578996 -9.911
## neighbourhood_cleansedMount Pleasant -20.111173 5.126506 -3.923
## neighbourhood_cleansedOakridge -69.532567 9.528276 -7.297
## neighbourhood_cleansedRenfrew-Collingwood -69.836883 5.926396 -11.784
## neighbourhood_cleansedSunset -69.170735 8.170675 -8.466
## neighbourhood_cleansedVictoria-Fraserview -96.241632 8.817099 -10.915
## neighbourhood_cleansedWest End -1.838209 4.817536 -0.382
## host_listings_count 0.147620 0.038198 3.865
## minimum_nights -0.524097 0.065133 -8.047
## maximum_nights -0.001959 0.002901 -0.675
## instant_bookablet -6.000667 3.245169 -1.849
## host_identity_verifiedt -0.496510 7.941801 -0.063
## availability_30 0.961190 0.328615 2.925
## availability_60 -0.224764 0.318781 -0.705
## availability_90 0.199924 0.160358 1.247
## review_scores_rating 24.092014 3.174877 7.588
## reviews_per_month -3.820635 0.858622 -4.450
## Pr(>|t|)
## (Intercept) 0.136421
## host_is_superhostt 0.000000014191313129 ***
## host_response_timewithin a day 0.757806
## host_response_timewithin a few hours 0.609151
## host_response_timewithin an hour 0.778933
## host_response_rate 0.354976
## host_acceptance_rate 0.120579
## accommodates 0.000000000000000452 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms < 0.0000000000000002 ***
## neighbourhood_cleansedDowntown Eastside 0.006821 **
## neighbourhood_cleansedHastings-Sunrise < 0.0000000000000002 ***
## neighbourhood_cleansedKerrisdale 0.000000000000000668 ***
## neighbourhood_cleansedKillarney < 0.0000000000000002 ***
## neighbourhood_cleansedKitsilano 0.314542
## neighbourhood_cleansedMarpole < 0.0000000000000002 ***
## neighbourhood_cleansedMount Pleasant 0.000089608773389718 ***
## neighbourhood_cleansedOakridge 0.000000000000382087 ***
## neighbourhood_cleansedRenfrew-Collingwood < 0.0000000000000002 ***
## neighbourhood_cleansedSunset < 0.0000000000000002 ***
## neighbourhood_cleansedVictoria-Fraserview < 0.0000000000000002 ***
## neighbourhood_cleansedWest End 0.702813
## host_listings_count 0.000114 ***
## minimum_nights 0.000000000000001255 ***
## maximum_nights 0.499538
## instant_bookablet 0.064550 .
## host_identity_verifiedt 0.950154
## availability_30 0.003473 **
## availability_60 0.480825
## availability_90 0.212599
## review_scores_rating 0.000000000000044119 ***
## reviews_per_month 0.000008940876595746 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.37 on 2741 degrees of freedom
## Multiple R-squared: 0.4962, Adjusted R-squared: 0.4905
## F-statistic: 87.08 on 31 and 2741 DF, p-value: < 0.00000000000000022
# Backward elimination by AIC, starting from the full model fitted on
# air_clean_sig2. With no `scope` supplied, step() only ever drops terms, so
# "backward" is simply its default written out.
step_model4 <- step(
  lm_model_significant_neighborhoods2,
  direction = "backward"
)
## Start: AIC=23543.67
## price ~ host_is_superhost + host_response_time + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_response_time 3 10095 13199573 23540
## - host_identity_verified 1 19 13189497 23542
## - maximum_nights 1 2194 13191673 23542
## - availability_60 1 2392 13191870 23542
## - host_response_rate 1 4118 13193597 23543
## - availability_90 1 7479 13196958 23543
## <none> 13189478 23544
## - host_acceptance_rate 1 11603 13201081 23544
## - instant_bookable 1 16453 13205931 23545
## - availability_30 1 41168 13230646 23550
## - host_listings_count 1 71865 13261343 23557
## - reviews_per_month 1 95276 13284755 23562
## - host_is_superhost 1 155696 13345174 23574
## - review_scores_rating 1 277084 13466562 23599
## - minimum_nights 1 311558 13501036 23606
## - accommodates 1 321470 13510948 23608
## - bathrooms 1 420220 13609698 23629
## - bedrooms 1 464122 13653600 23638
## - neighbourhood_cleansed 12 2709338 15898816 24038
##
## Step: AIC=23539.79
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## availability_90 + review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## - host_identity_verified 1 53 13199626 23538
## - maximum_nights 1 1720 13201293 23538
## - availability_60 1 2402 13201975 23538
## - availability_90 1 7846 13207419 23539
## <none> 13199573 23540
## - instant_bookable 1 13880 13213453 23541
## - host_acceptance_rate 1 17336 13216909 23541
## - host_response_rate 1 22322 13221895 23543
## - availability_30 1 40689 13240262 23546
## - host_listings_count 1 69024 13268598 23552
## - reviews_per_month 1 90398 13289971 23557
## - host_is_superhost 1 160846 13360419 23571
## - review_scores_rating 1 277091 13476664 23595
## - minimum_nights 1 319454 13519027 23604
## - accommodates 1 328047 13527620 23606
## - bathrooms 1 421017 13620590 23625
## - bedrooms 1 464058 13663632 23634
## - neighbourhood_cleansed 12 2718313 15917886 24035
##
## Step: AIC=23537.8
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - maximum_nights 1 1733 13201359 23536
## - availability_60 1 2402 13202028 23536
## - availability_90 1 7837 13207463 23537
## <none> 13199626 23538
## - instant_bookable 1 13889 13213515 23539
## - host_acceptance_rate 1 17381 13217008 23540
## - host_response_rate 1 22365 13221991 23541
## - availability_30 1 40734 13240360 23544
## - host_listings_count 1 68993 13268620 23550
## - reviews_per_month 1 90500 13290126 23555
## - host_is_superhost 1 161122 13360748 23569
## - review_scores_rating 1 277150 13476776 23593
## - minimum_nights 1 319411 13519038 23602
## - accommodates 1 328033 13527660 23604
## - bathrooms 1 421005 13620631 23623
## - bedrooms 1 464057 13663684 23632
## - neighbourhood_cleansed 12 2719935 15919562 24033
##
## Step: AIC=23536.17
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - availability_60 1 2243 13203602 23535
## - availability_90 1 7516 13208875 23536
## <none> 13201359 23536
## - instant_bookable 1 13450 13214809 23537
## - host_acceptance_rate 1 17356 13218715 23538
## - host_response_rate 1 22353 13223712 23539
## - availability_30 1 40730 13242089 23543
## - host_listings_count 1 68831 13270190 23549
## - reviews_per_month 1 89178 13290537 23553
## - host_is_superhost 1 159619 13360978 23568
## - review_scores_rating 1 279909 13481268 23592
## - minimum_nights 1 325185 13526544 23602
## - accommodates 1 328369 13529728 23602
## - bathrooms 1 421624 13622983 23621
## - bedrooms 1 462455 13663814 23630
## - neighbourhood_cleansed 12 2718799 15920158 24032
##
## Step: AIC=23534.64
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## availability_30 + availability_90 + review_scores_rating +
## reviews_per_month
##
## Df Sum of Sq RSS AIC
## - availability_90 1 8900 13212502 23535
## <none> 13203602 23535
## - instant_bookable 1 13551 13217152 23536
## - host_acceptance_rate 1 17347 13220949 23536
## - host_response_rate 1 22536 13226138 23537
## - availability_30 1 60512 13264113 23545
## - host_listings_count 1 69659 13273261 23547
## - reviews_per_month 1 90517 13294119 23552
## - host_is_superhost 1 158177 13361779 23566
## - review_scores_rating 1 280634 13484235 23591
## - minimum_nights 1 323693 13527295 23600
## - accommodates 1 328351 13531953 23601
## - bathrooms 1 421044 13624646 23620
## - bedrooms 1 461902 13665503 23628
## - neighbourhood_cleansed 12 2716602 15920203 24030
##
## Step: AIC=23534.51
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## availability_30 + review_scores_rating + reviews_per_month
##
## Df Sum of Sq RSS AIC
## <none> 13212502 23535
## - instant_bookable 1 12753 13225255 23535
## - host_acceptance_rate 1 19811 13232312 23537
## - host_response_rate 1 23811 13236313 23538
## - host_listings_count 1 73465 13285967 23548
## - reviews_per_month 1 88623 13301125 23551
## - host_is_superhost 1 154495 13366996 23565
## - availability_30 1 252119 13464620 23585
## - review_scores_rating 1 280320 13492821 23591
## - minimum_nights 1 321858 13534359 23599
## - accommodates 1 332452 13544954 23601
## - bathrooms 1 420075 13632577 23619
## - bedrooms 1 459982 13672484 23627
## - neighbourhood_cleansed 12 2714326 15926828 24029
# Coefficient table, residual summary and fit statistics for step_model4,
# the model returned by step() on lm_model_significant_neighborhoods2.
summary(step_model4)
##
## Call:
## lm(formula = price ~ host_is_superhost + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## instant_bookable + availability_30 + review_scores_rating +
## reviews_per_month, data = air_clean_sig2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -393.23 -41.77 -10.48 25.41 525.80
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -36.89800 19.94690 -1.850
## host_is_superhostt 16.26849 2.86995 5.669
## host_response_rate -30.43399 13.67577 -2.225
## host_acceptance_rate 19.16596 9.44204 2.030
## accommodates 10.52074 1.26522 8.315
## bathrooms 32.48178 3.47505 9.347
## bedrooms 31.76318 3.24741 9.781
## neighbourhood_cleansedDowntown Eastside -14.81689 5.31292 -2.789
## neighbourhood_cleansedHastings-Sunrise -70.45222 6.20485 -11.354
## neighbourhood_cleansedKerrisdale -85.34523 10.38384 -8.219
## neighbourhood_cleansedKillarney -85.64904 10.24327 -8.361
## neighbourhood_cleansedKitsilano 4.23298 4.69045 0.902
## neighbourhood_cleansedMarpole -75.16791 7.56539 -9.936
## neighbourhood_cleansedMount Pleasant -20.49215 5.11060 -4.010
## neighbourhood_cleansedOakridge -70.54750 9.49403 -7.431
## neighbourhood_cleansedRenfrew-Collingwood -69.46372 5.90801 -11.758
## neighbourhood_cleansedSunset -69.02221 8.15015 -8.469
## neighbourhood_cleansedVictoria-Fraserview -95.93951 8.79977 -10.903
## neighbourhood_cleansedWest End -1.92384 4.80865 -0.400
## host_listings_count 0.14821 0.03792 3.909
## minimum_nights -0.52930 0.06469 -8.182
## instant_bookablet -5.23210 3.21259 -1.629
## availability_30 1.02492 0.14154 7.241
## review_scores_rating 24.07857 3.15347 7.636
## reviews_per_month -3.62517 0.84438 -4.293
## Pr(>|t|)
## (Intercept) 0.06445 .
## host_is_superhostt 0.000000015898367023 ***
## host_response_rate 0.02614 *
## host_acceptance_rate 0.04247 *
## accommodates < 0.0000000000000002 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms < 0.0000000000000002 ***
## neighbourhood_cleansedDowntown Eastside 0.00533 **
## neighbourhood_cleansedHastings-Sunrise < 0.0000000000000002 ***
## neighbourhood_cleansedKerrisdale 0.000000000000000312 ***
## neighbourhood_cleansedKillarney < 0.0000000000000002 ***
## neighbourhood_cleansedKitsilano 0.36689
## neighbourhood_cleansedMarpole < 0.0000000000000002 ***
## neighbourhood_cleansedMount Pleasant 0.000062405504155170 ***
## neighbourhood_cleansedOakridge 0.000000000000143419 ***
## neighbourhood_cleansedRenfrew-Collingwood < 0.0000000000000002 ***
## neighbourhood_cleansedSunset < 0.0000000000000002 ***
## neighbourhood_cleansedVictoria-Fraserview < 0.0000000000000002 ***
## neighbourhood_cleansedWest End 0.68913
## host_listings_count 0.000094953829834063 ***
## minimum_nights 0.000000000000000423 ***
## instant_bookablet 0.10351
## availability_30 0.000000000000574160 ***
## review_scores_rating 0.000000000000030812 ***
## reviews_per_month 0.000018214648284954 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.34 on 2748 degrees of freedom
## Multiple R-squared: 0.4953, Adjusted R-squared: 0.4909
## F-statistic: 112.4 on 24 and 2748 DF, p-value: < 0.00000000000000022
# Scatter plot of price against guest capacity, one panel per bedroom count.
# Columns are referenced by bare name: ggplot2 looks them up in the plot data
# (`air_clean_sig2`). Writing `air_clean_sig2$col` inside aes()/facet_wrap()
# bypasses that lookup and is discouraged by ggplot2.
p <- ggplot(air_clean_sig2, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bedrooms)
print(
  p + labs(
    title = "Price compared to accommodation size split by bedrooms (Model 4)",
    y = "Price",
    x = "Accommodates"
  )
)
# Price against guest capacity, one panel per bathroom count. Points are drawn
# first and one boxplot per `accommodates` value is layered on top (layer order
# is kept as in the original plot).
# Columns are referenced by bare name so ggplot2 resolves them in the plot data;
# `air_clean_sig2$col` inside aes()/facet_wrap() is discouraged by ggplot2.
p <- ggplot(air_clean_sig2, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bathrooms) +
  geom_boxplot(aes(group = accommodates))
print(
  p + labs(
    title = "Price compared to accommodation size split by bathrooms (Model 4)",
    y = "Price",
    x = "Accommodates"
  )
)
# One price boxplot per bathroom count, with a smoother over all listings.
# `group` is set only on the boxplot layer, so geom_smooth() fits a single
# curve across the whole data set.
# Columns are referenced by bare name so ggplot2 resolves them in the plot data;
# `air_clean_sig2$col` inside aes() is discouraged by ggplot2.
p <- ggplot(air_clean_sig2, aes(x = bathrooms, y = price)) +
  geom_boxplot(aes(group = bathrooms)) +
  geom_smooth()
print(
  p + labs(
    title = "Price compared to bathrooms (Model 4)",
    y = "Price",
    x = "Bathrooms"
  )
)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Scatter plot of price against the number of reviews of each listing.
# Columns are referenced by bare name so ggplot2 resolves them in the plot data;
# `air_clean_sig2$col` inside aes() is discouraged by ggplot2.
p <- ggplot(air_clean_sig2, aes(x = number_of_reviews, y = price)) +
  geom_point()
print(
  p + labs(
    title = "Price compared to the number of reviews (Model 4)",
    y = "Price",
    x = "Number of reviews"
  )
)
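This is the generalised linear model fitted with `glm()` on the same data as OLS model 1. Note that `glm()` is called without a `family`, so it uses the default Gaussian family and is equivalent to OLS rather than a logistic regression.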
# Fit a GLM of price on every other column of air_clean.
# NOTE: no `family` is given, so glm() uses its default gaussian family with
# identity link. This is the same fit as lm(price ~ ., data = air_clean), not a
# logistic regression (which would need family = binomial and a binary
# response). The name `log_model` is kept because later code refers to it.
log_model <- glm(formula = price ~ ., data = air_clean)
summary(log_model)
##
## Call:
## glm(formula = price ~ ., data = air_clean)
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -67.172676 80.562109 -0.834
## host_is_superhostt 14.762586 6.761144 2.183
## host_response_timewithin a day -40.830748 52.200620 -0.782
## host_response_timewithin a few hours -20.547067 55.349773 -0.371
## host_response_timewithin an hour -31.411719 55.969879 -0.561
## host_response_rate -19.676596 57.839236 -0.340
## host_acceptance_rate 28.722659 21.927178 1.310
## accommodates 14.238930 2.961959 4.807
## bathrooms 62.326685 7.552568 8.252
## bedrooms 36.506232 7.421203 4.919
## neighbourhood_cleansedDowntown 20.043396 26.950665 0.744
## neighbourhood_cleansedDowntown Eastside 3.105323 29.397390 0.106
## neighbourhood_cleansedDunbar Southlands -33.022531 31.988861 -1.032
## neighbourhood_cleansedFairview -4.899654 33.135114 -0.148
## neighbourhood_cleansedGrandview-Woodland -28.848818 30.797361 -0.937
## neighbourhood_cleansedHastings-Sunrise -58.146619 30.505126 -1.906
## neighbourhood_cleansedKensington-Cedar Cottage -37.351262 29.061805 -1.285
## neighbourhood_cleansedKerrisdale -60.913923 37.472184 -1.626
## neighbourhood_cleansedKillarney -79.760936 37.600875 -2.121
## neighbourhood_cleansedKitsilano 24.745238 28.403872 0.871
## neighbourhood_cleansedMarpole -64.902130 32.760190 -1.981
## neighbourhood_cleansedMount Pleasant 30.965973 29.051691 1.066
## neighbourhood_cleansedOakridge -4.477156 35.540171 -0.126
## neighbourhood_cleansedRenfrew-Collingwood -51.634954 30.070913 -1.717
## neighbourhood_cleansedRiley Park -21.213545 29.297222 -0.724
## neighbourhood_cleansedShaughnessy -14.743869 35.741681 -0.413
## neighbourhood_cleansedSouth Cambie -24.586941 37.278204 -0.660
## neighbourhood_cleansedStrathcona -7.691959 46.654500 -0.165
## neighbourhood_cleansedSunset -56.185199 33.637982 -1.670
## neighbourhood_cleansedVictoria-Fraserview -93.818094 34.781102 -2.697
## neighbourhood_cleansedWest End 18.125489 28.792180 0.630
## neighbourhood_cleansedWest Point Grey 74.093006 36.696526 2.019
## host_listings_count 0.032812 0.100400 0.327
## minimum_nights 0.229169 0.154295 1.485
## maximum_nights 0.002245 0.006700 0.335
## instant_bookablet -8.823629 7.585355 -1.163
## host_identity_verifiedt -64.309810 17.284109 -3.721
## availability_30 1.715346 0.769031 2.231
## availability_60 -0.356230 0.759588 -0.469
## availability_90 0.085484 0.393249 0.217
## review_scores_rating 32.805207 7.791446 4.210
## reviews_per_month -4.661847 2.270830 -2.053
## has_availabilityt -15.597963 54.691575 -0.285
## availability_365 -0.005370 0.033605 -0.160
## number_of_reviews -0.026516 0.048188 -0.550
## Pr(>|t|)
## (Intercept) 0.404447
## host_is_superhostt 0.029064 *
## host_response_timewithin a day 0.434153
## host_response_timewithin a few hours 0.710493
## host_response_timewithin an hour 0.574677
## host_response_rate 0.733729
## host_acceptance_rate 0.190306
## accommodates 0.000001590 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms 0.000000906 ***
## neighbourhood_cleansedDowntown 0.457100
## neighbourhood_cleansedDowntown Eastside 0.915879
## neighbourhood_cleansedDunbar Southlands 0.301991
## neighbourhood_cleansedFairview 0.882454
## neighbourhood_cleansedGrandview-Woodland 0.348957
## neighbourhood_cleansedHastings-Sunrise 0.056710 .
## neighbourhood_cleansedKensington-Cedar Cottage 0.198789
## neighbourhood_cleansedKerrisdale 0.104123
## neighbourhood_cleansedKillarney 0.033966 *
## neighbourhood_cleansedKitsilano 0.383705
## neighbourhood_cleansedMarpole 0.047649 *
## neighbourhood_cleansedMount Pleasant 0.286541
## neighbourhood_cleansedOakridge 0.899759
## neighbourhood_cleansedRenfrew-Collingwood 0.086042 .
## neighbourhood_cleansedRiley Park 0.469061
## neighbourhood_cleansedShaughnessy 0.679988
## neighbourhood_cleansedSouth Cambie 0.509581
## neighbourhood_cleansedStrathcona 0.869055
## neighbourhood_cleansedSunset 0.094945 .
## neighbourhood_cleansedVictoria-Fraserview 0.007020 **
## neighbourhood_cleansedWest End 0.529041
## neighbourhood_cleansedWest Point Grey 0.043550 *
## host_listings_count 0.743827
## minimum_nights 0.137557
## maximum_nights 0.737538
## instant_bookablet 0.244804
## host_identity_verifiedt 0.000202 ***
## availability_30 0.025771 *
## availability_60 0.639112
## availability_90 0.827924
## review_scores_rating 0.000026085 ***
## reviews_per_month 0.040149 *
## has_availabilityt 0.775508
## availability_365 0.873048
## number_of_reviews 0.582172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 35374.87)
##
## Null deviance: 171918871 on 3817 degrees of freedom
## Residual deviance: 133469400 on 3773 degrees of freedom
## AIC: 50871
##
## Number of Fisher Scoring iterations: 2
# Backward stepwise selection by AIC on `log_model`. That model was fitted with
# glm()'s default gaussian family, so this is not a logistic regression.
# `direction = "backward"` spells out what step() does when no `scope` is given.
logstep_model1 <- step(log_model, direction = "backward")
## Start: AIC=50870.55
## price ~ host_is_superhost + host_response_time + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month + has_availability + availability_365 +
## number_of_reviews
##
## Df Deviance AIC
## - host_response_time 3 133571339 50867
## - availability_365 1 133470304 50869
## - availability_90 1 133471072 50869
## - has_availability 1 133472278 50869
## - host_listings_count 1 133473179 50869
## - maximum_nights 1 133473374 50869
## - host_response_rate 1 133473494 50869
## - availability_60 1 133477181 50869
## - number_of_reviews 1 133480112 50869
## - instant_bookable 1 133517268 50870
## - host_acceptance_rate 1 133530099 50870
## <none> 133469400 50871
## - minimum_nights 1 133547438 50871
## - reviews_per_month 1 133618488 50873
## - host_is_superhost 1 133638048 50873
## - availability_30 1 133645399 50874
## - host_identity_verified 1 133959129 50883
## - review_scores_rating 1 134096511 50886
## - accommodates 1 134286908 50892
## - bedrooms 1 134325413 50893
## - bathrooms 1 135878494 50937
## - neighbourhood_cleansed 22 137735622 50947
##
## Step: AIC=50867.47
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## availability_90 + review_scores_rating + reviews_per_month +
## has_availability + availability_365 + number_of_reviews
##
## Df Deviance AIC
## - availability_90 1 133573294 50866
## - maximum_nights 1 133573980 50866
## - has_availability 1 133574128 50866
## - availability_365 1 133574155 50866
## - host_listings_count 1 133577591 50866
## - availability_60 1 133579167 50866
## - number_of_reviews 1 133581934 50866
## - host_response_rate 1 133620592 50867
## - instant_bookable 1 133625033 50867
## - host_acceptance_rate 1 133628901 50867
## <none> 133571339 50867
## - minimum_nights 1 133653270 50868
## - reviews_per_month 1 133738201 50870
## - availability_30 1 133742973 50870
## - host_is_superhost 1 133743863 50870
## - host_identity_verified 1 134053143 50879
## - review_scores_rating 1 134224542 50884
## - accommodates 1 134377074 50888
## - bedrooms 1 134432077 50890
## - bathrooms 1 135972049 50933
## - neighbourhood_cleansed 22 137816600 50943
##
## Step: AIC=50865.52
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## review_scores_rating + reviews_per_month + has_availability +
## availability_365 + number_of_reviews
##
## Df Deviance AIC
## - availability_365 1 133575156 50864
## - maximum_nights 1 133575940 50864
## - has_availability 1 133576130 50864
## - host_listings_count 1 133580004 50864
## - availability_60 1 133583592 50864
## - number_of_reviews 1 133584164 50864
## - host_response_rate 1 133623189 50865
## - instant_bookable 1 133627198 50865
## - host_acceptance_rate 1 133632090 50865
## <none> 133573294 50866
## - minimum_nights 1 133655707 50866
## - reviews_per_month 1 133739816 50868
## - host_is_superhost 1 133744308 50868
## - availability_30 1 133759340 50869
## - host_identity_verified 1 134054885 50877
## - review_scores_rating 1 134227040 50882
## - accommodates 1 134379388 50886
## - bedrooms 1 134432734 50888
## - bathrooms 1 135972401 50931
## - neighbourhood_cleansed 22 137816824 50941
##
## Step: AIC=50863.58
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## review_scores_rating + reviews_per_month + has_availability +
## number_of_reviews
##
## Df Deviance AIC
## - maximum_nights 1 133577322 50862
## - has_availability 1 133577903 50862
## - host_listings_count 1 133581960 50862
## - number_of_reviews 1 133585759 50862
## - availability_60 1 133589656 50862
## - host_response_rate 1 133624655 50863
## - instant_bookable 1 133630107 50863
## - host_acceptance_rate 1 133633893 50863
## <none> 133575156 50864
## - minimum_nights 1 133655790 50864
## - reviews_per_month 1 133740185 50866
## - host_is_superhost 1 133746269 50866
## - availability_30 1 133766211 50867
## - host_identity_verified 1 134057245 50875
## - review_scores_rating 1 134233350 50880
## - accommodates 1 134379651 50885
## - bedrooms 1 134437591 50886
## - bathrooms 1 135973181 50930
## - neighbourhood_cleansed 22 137818873 50939
##
## Step: AIC=50861.64
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## review_scores_rating + reviews_per_month + has_availability +
## number_of_reviews
##
## Df Deviance AIC
## - has_availability 1 133579986 50860
## - host_listings_count 1 133584148 50860
## - number_of_reviews 1 133587109 50860
## - availability_60 1 133591581 50860
## - host_response_rate 1 133626730 50861
## - instant_bookable 1 133632663 50861
## - host_acceptance_rate 1 133636372 50861
## <none> 133577322 50862
## - minimum_nights 1 133661509 50862
## - reviews_per_month 1 133747403 50864
## - host_is_superhost 1 133750407 50865
## - availability_30 1 133767212 50865
## - host_identity_verified 1 134058286 50873
## - review_scores_rating 1 134233477 50878
## - accommodates 1 134382635 50883
## - bedrooms 1 134442466 50884
## - bathrooms 1 135973748 50928
## - neighbourhood_cleansed 22 137838435 50938
##
## Step: AIC=50859.71
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## review_scores_rating + reviews_per_month + number_of_reviews
##
## Df Deviance AIC
## - host_listings_count 1 133586809 50858
## - number_of_reviews 1 133589862 50858
## - availability_60 1 133594278 50858
## - host_response_rate 1 133629227 50859
## - instant_bookable 1 133636180 50859
## - host_acceptance_rate 1 133639353 50859
## <none> 133579986 50860
## - minimum_nights 1 133663911 50860
## - reviews_per_month 1 133750961 50863
## - host_is_superhost 1 133754920 50863
## - availability_30 1 133770242 50863
## - host_identity_verified 1 134060436 50871
## - review_scores_rating 1 134234808 50876
## - accommodates 1 134387961 50881
## - bedrooms 1 134442641 50882
## - bathrooms 1 135982995 50926
## - neighbourhood_cleansed 22 137839934 50936
##
## Step: AIC=50857.91
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## minimum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + review_scores_rating +
## reviews_per_month + number_of_reviews
##
## Df Deviance AIC
## - number_of_reviews 1 133597402 50856
## - availability_60 1 133600149 50856
## - host_response_rate 1 133634998 50857
## - instant_bookable 1 133640108 50857
## - host_acceptance_rate 1 133646522 50858
## <none> 133586809 50858
## - minimum_nights 1 133672157 50858
## - host_is_superhost 1 133756256 50861
## - reviews_per_month 1 133761452 50861
## - availability_30 1 133774257 50861
## - host_identity_verified 1 134062798 50869
## - review_scores_rating 1 134236528 50874
## - accommodates 1 134391163 50879
## - bedrooms 1 134455228 50881
## - bathrooms 1 135987792 50924
## - neighbourhood_cleansed 22 137902499 50935
##
## Step: AIC=50856.21
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## minimum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + review_scores_rating +
## reviews_per_month
##
## Df Deviance AIC
## - availability_60 1 133609135 50855
## - host_response_rate 1 133645052 50856
## - instant_bookable 1 133646297 50856
## - host_acceptance_rate 1 133656508 50856
## <none> 133597402 50856
## - minimum_nights 1 133682957 50857
## - host_is_superhost 1 133759606 50859
## - availability_30 1 133781028 50859
## - reviews_per_month 1 133856509 50862
## - host_identity_verified 1 134082463 50868
## - review_scores_rating 1 134253891 50873
## - accommodates 1 134399744 50877
## - bedrooms 1 134472293 50879
## - bathrooms 1 136008468 50923
## - neighbourhood_cleansed 22 137924054 50934
##
## Step: AIC=50854.55
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## minimum_nights + instant_bookable + host_identity_verified +
## availability_30 + review_scores_rating + reviews_per_month
##
## Df Deviance AIC
## - host_response_rate 1 133655440 50854
## - instant_bookable 1 133659906 50854
## - host_acceptance_rate 1 133664127 50854
## <none> 133609135 50855
## - minimum_nights 1 133693885 50855
## - host_is_superhost 1 133774242 50857
## - reviews_per_month 1 133875012 50860
## - host_identity_verified 1 134098363 50867
## - availability_30 1 134135130 50868
## - review_scores_rating 1 134267861 50871
## - accommodates 1 134408179 50875
## - bedrooms 1 134486601 50878
## - bathrooms 1 136017601 50921
## - neighbourhood_cleansed 22 137931573 50932
##
## Step: AIC=50853.87
## price ~ host_is_superhost + host_acceptance_rate + accommodates +
## bathrooms + bedrooms + neighbourhood_cleansed + minimum_nights +
## instant_bookable + host_identity_verified + availability_30 +
## review_scores_rating + reviews_per_month
##
## Df Deviance AIC
## - host_acceptance_rate 1 133686097 50853
## - instant_bookable 1 133705955 50853
## <none> 133655440 50854
## - minimum_nights 1 133738786 50854
## - host_is_superhost 1 133814197 50856
## - reviews_per_month 1 133925694 50860
## - host_identity_verified 1 134151078 50866
## - availability_30 1 134200812 50867
## - review_scores_rating 1 134307304 50870
## - accommodates 1 134454315 50875
## - bedrooms 1 134531552 50877
## - bathrooms 1 136072762 50920
## - neighbourhood_cleansed 22 138011410 50932
##
## Step: AIC=50852.75
## price ~ host_is_superhost + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + minimum_nights + instant_bookable +
## host_identity_verified + availability_30 + review_scores_rating +
## reviews_per_month
##
## Df Deviance AIC
## - instant_bookable 1 133723993 50852
## <none> 133686097 50853
## - minimum_nights 1 133759226 50853
## - host_is_superhost 1 133878636 50856
## - reviews_per_month 1 133931223 50858
## - host_identity_verified 1 134186504 50865
## - availability_30 1 134234688 50866
## - review_scores_rating 1 134337787 50869
## - accommodates 1 134514721 50874
## - bedrooms 1 134556734 50876
## - bathrooms 1 136085308 50919
## - neighbourhood_cleansed 22 138020508 50931
##
## Step: AIC=50851.83
## price ~ host_is_superhost + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + minimum_nights + host_identity_verified +
## availability_30 + review_scores_rating + reviews_per_month
##
## Df Deviance AIC
## <none> 133723993 50852
## - minimum_nights 1 133801713 50852
## - host_is_superhost 1 133932502 50856
## - reviews_per_month 1 133993097 50858
## - host_identity_verified 1 134219758 50864
## - availability_30 1 134256600 50865
## - review_scores_rating 1 134405428 50869
## - accommodates 1 134537720 50873
## - bedrooms 1 134605798 50875
## - bathrooms 1 136121939 50918
## - neighbourhood_cleansed 22 138042976 50929
# Summary of the stepwise-selected model for "Log Model 1".
# NOTE: despite the "log"/"logistic" naming, this is not a logistic regression.
# The glm() call shown in the summary has no `family` argument, so the default
# gaussian family is used (see "Dispersion parameter for gaussian family" in
# the output); the coefficients are on the price scale.
summary(logstep_model1)
##
## Call:
## glm(formula = price ~ host_is_superhost + accommodates + bathrooms +
## bedrooms + neighbourhood_cleansed + minimum_nights + host_identity_verified +
## availability_30 + review_scores_rating + reviews_per_month,
## data = air_clean)
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -112.0849 49.0409 -2.286
## host_is_superhostt 15.7748 6.4926 2.430
## accommodates 14.0895 2.9354 4.800
## bathrooms 62.0460 7.5302 8.240
## bedrooms 36.9237 7.3898 4.997
## neighbourhood_cleansedDowntown 19.7935 26.8794 0.736
## neighbourhood_cleansedDowntown Eastside 4.1203 29.3261 0.140
## neighbourhood_cleansedDunbar Southlands -33.0215 31.8826 -1.036
## neighbourhood_cleansedFairview -4.7947 33.0391 -0.145
## neighbourhood_cleansedGrandview-Woodland -29.9364 30.6737 -0.976
## neighbourhood_cleansedHastings-Sunrise -57.0590 30.4421 -1.874
## neighbourhood_cleansedKensington-Cedar Cottage -38.7957 28.9638 -1.339
## neighbourhood_cleansedKerrisdale -60.4145 37.4025 -1.615
## neighbourhood_cleansedKillarney -80.8849 37.4822 -2.158
## neighbourhood_cleansedKitsilano 23.9052 28.3344 0.844
## neighbourhood_cleansedMarpole -65.3931 32.7020 -2.000
## neighbourhood_cleansedMount Pleasant 31.4575 29.0042 1.085
## neighbourhood_cleansedOakridge -5.1763 35.4228 -0.146
## neighbourhood_cleansedRenfrew-Collingwood -52.4513 29.9687 -1.750
## neighbourhood_cleansedRiley Park -20.5983 29.2187 -0.705
## neighbourhood_cleansedShaughnessy -20.1101 35.3966 -0.568
## neighbourhood_cleansedSouth Cambie -24.6347 37.1773 -0.663
## neighbourhood_cleansedStrathcona -8.0929 46.5123 -0.174
## neighbourhood_cleansedSunset -56.7506 33.5279 -1.693
## neighbourhood_cleansedVictoria-Fraserview -92.5127 34.6655 -2.669
## neighbourhood_cleansedWest End 18.6769 28.6604 0.652
## neighbourhood_cleansedWest Point Grey 70.9888 36.6270 1.938
## minimum_nights 0.2237 0.1508 1.483
## host_identity_verifiedt -64.4253 17.1962 -3.746
## availability_30 1.2442 0.3204 3.883
## review_scores_rating 33.7967 7.6944 4.392
## reviews_per_month -5.4174 1.9627 -2.760
## Pr(>|t|)
## (Intercept) 0.022336 *
## host_is_superhostt 0.015159 *
## accommodates 0.000001649396864246 ***
## bathrooms 0.000000000000000236 ***
## bedrooms 0.000000609972329033 ***
## neighbourhood_cleansedDowntown 0.461544
## neighbourhood_cleansedDowntown Eastside 0.888273
## neighbourhood_cleansedDunbar Southlands 0.300399
## neighbourhood_cleansedFairview 0.884622
## neighbourhood_cleansedGrandview-Woodland 0.329146
## neighbourhood_cleansedHastings-Sunrise 0.060960 .
## neighbourhood_cleansedKensington-Cedar Cottage 0.180503
## neighbourhood_cleansedKerrisdale 0.106339
## neighbourhood_cleansedKillarney 0.030994 *
## neighbourhood_cleansedKitsilano 0.398902
## neighbourhood_cleansedMarpole 0.045608 *
## neighbourhood_cleansedMount Pleasant 0.278176
## neighbourhood_cleansedOakridge 0.883828
## neighbourhood_cleansedRenfrew-Collingwood 0.080164 .
## neighbourhood_cleansedRiley Park 0.480872
## neighbourhood_cleansedShaughnessy 0.569976
## neighbourhood_cleansedSouth Cambie 0.507610
## neighbourhood_cleansedStrathcona 0.861878
## neighbourhood_cleansedSunset 0.090606 .
## neighbourhood_cleansedVictoria-Fraserview 0.007646 **
## neighbourhood_cleansedWest End 0.514659
## neighbourhood_cleansedWest Point Grey 0.052678 .
## minimum_nights 0.138057
## host_identity_verifiedt 0.000182 ***
## availability_30 0.000105 ***
## review_scores_rating 0.000011519513171064 ***
## reviews_per_month 0.005804 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 35320.65)
##
## Null deviance: 171918871 on 3817 degrees of freedom
## Residual deviance: 133723993 on 3786 degrees of freedom
## AIC: 50852
##
## Number of Fisher Scoring iterations: 2
# Exploratory plots for "Log Model 1".
# These show the observations stored in the model frame of `log_model`
# (log_model$model), not fitted values. The model frame is handed to ggplot()
# as the data and columns are mapped by bare name: `$` extraction inside
# aes()/facet_wrap() bypasses ggplot2's data handling, and passing the glm
# object itself relies on an implicit fortify() conversion.
log_model_data <- log_model$model

# Price against accommodates, one panel per bedroom count.
p <- ggplot(log_model_data, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bedrooms)
print(p + labs(
  title = "Price compared to accommodation size split by bedrooms (Log Model 1)",
  y = "Price", x = "Accommodates"
))

# Price against accommodates, one panel per bathroom count, with a boxplot
# per accommodates value drawn over the points.
p <- ggplot(log_model_data, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bathrooms) +
  geom_boxplot(aes(group = accommodates))
print(p + labs(
  title = "Price compared to accommodation size split by bathrooms (Log Model 1)",
  y = "Price", x = "Accommodates"
))

# Distribution of price for each bathroom count.
p <- ggplot(log_model_data, aes(x = bathrooms, y = price)) +
  geom_boxplot(aes(group = bathrooms))
print(p + labs(
  title = "Price compared to bathrooms (Log Model 1)",
  y = "Price", x = "Bathrooms"
))

# Price against the number of reviews.
p <- ggplot(log_model_data, aes(x = number_of_reviews, y = price)) +
  geom_point()
print(p + labs(
  title = "Price compared to the number of reviews (Log Model 1)",
  y = "Price", x = "Number of reviews"
))
Logistic model using the same data as Model 4
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Fit "Log Model 2": a GLM of price on every other column of
# air_clean_filtered (`price ~ .`).
# NOTE: no `family` is supplied, so glm() uses its default gaussian family.
# This is therefore a linear model for price, not a logistic regression (the
# summary below reports "Dispersion parameter for gaussian family").
# NOTE(review): the earlier comment said only "significant neighbourhoods" are
# used, but the summary lists the neighbourhood_cleansed levels in full
# (22 Df in the step table) -- confirm how air_clean_filtered was built.
log_model2 <- glm(price ~ ., data = air_clean_filtered)
summary(log_model2)
##
## Call:
## glm(formula = price ~ ., data = air_clean_filtered)
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -35.354771 29.905671 -1.182
## host_is_superhostt 14.320777 2.509036 5.708
## host_response_timewithin a day -22.579428 19.995827 -1.129
## host_response_timewithin a few hours -20.581804 21.175437 -0.972
## host_response_timewithin an hour -19.771795 21.398596 -0.924
## host_response_rate -20.569007 21.858076 -0.941
## host_acceptance_rate 15.377913 8.166207 1.883
## accommodates 10.729022 1.109471 9.670
## bathrooms 30.678979 2.889193 10.619
## bedrooms 37.140235 2.800603 13.262
## neighbourhood_cleansedDowntown 39.845624 10.207125 3.904
## neighbourhood_cleansedDowntown Eastside 25.293042 11.088682 2.281
## neighbourhood_cleansedDunbar Southlands -13.617895 12.062133 -1.129
## neighbourhood_cleansedFairview 23.965913 12.439301 1.927
## neighbourhood_cleansedGrandview-Woodland -2.465421 11.591317 -0.213
## neighbourhood_cleansedHastings-Sunrise -31.576074 11.482826 -2.750
## neighbourhood_cleansedKensington-Cedar Cottage -19.747292 10.985093 -1.798
## neighbourhood_cleansedKerrisdale -47.166174 14.157352 -3.332
## neighbourhood_cleansedKillarney -51.747185 14.060599 -3.680
## neighbourhood_cleansedKitsilano 41.462154 10.746738 3.858
## neighbourhood_cleansedMarpole -36.700292 12.301660 -2.983
## neighbourhood_cleansedMount Pleasant 17.959586 10.971467 1.637
## neighbourhood_cleansedOakridge -32.875063 13.534723 -2.429
## neighbourhood_cleansedRenfrew-Collingwood -31.193802 11.338279 -2.751
## neighbourhood_cleansedRiley Park -4.467922 11.073163 -0.403
## neighbourhood_cleansedShaughnessy -14.477791 13.493965 -1.073
## neighbourhood_cleansedSouth Cambie -2.689217 13.939127 -0.193
## neighbourhood_cleansedStrathcona 19.186889 17.353452 1.106
## neighbourhood_cleansedSunset -32.634434 12.651370 -2.580
## neighbourhood_cleansedVictoria-Fraserview -59.517197 13.035596 -4.566
## neighbourhood_cleansedWest End 38.655672 10.875146 3.554
## neighbourhood_cleansedWest Point Grey 18.198157 14.135005 1.287
## host_listings_count 0.116980 0.037061 3.156
## minimum_nights -0.645555 0.059340 -10.879
## maximum_nights -0.002560 0.002487 -1.029
## instant_bookablet -6.340251 2.814059 -2.253
## host_identity_verifiedt -3.155202 6.490253 -0.486
## availability_30 1.162535 0.285523 4.072
## availability_60 -0.407248 0.280735 -1.451
## availability_90 0.084992 0.145345 0.585
## review_scores_rating 24.291880 2.879319 8.437
## reviews_per_month -4.186840 0.841474 -4.976
## has_availabilityt -23.901324 20.165539 -1.185
## availability_365 0.037347 0.012469 2.995
## number_of_reviews -0.043240 0.017794 -2.430
## Pr(>|t|)
## (Intercept) 0.237198
## host_is_superhostt 0.0000000123 ***
## host_response_timewithin a day 0.258883
## host_response_timewithin a few hours 0.331131
## host_response_timewithin an hour 0.355559
## host_response_rate 0.346753
## host_acceptance_rate 0.059763 .
## accommodates < 0.0000000000000002 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms < 0.0000000000000002 ***
## neighbourhood_cleansedDowntown 0.0000964083 ***
## neighbourhood_cleansedDowntown Eastside 0.022606 *
## neighbourhood_cleansedDunbar Southlands 0.258979
## neighbourhood_cleansedFairview 0.054102 .
## neighbourhood_cleansedGrandview-Woodland 0.831576
## neighbourhood_cleansedHastings-Sunrise 0.005991 **
## neighbourhood_cleansedKensington-Cedar Cottage 0.072314 .
## neighbourhood_cleansedKerrisdale 0.000872 ***
## neighbourhood_cleansedKillarney 0.000236 ***
## neighbourhood_cleansedKitsilano 0.000116 ***
## neighbourhood_cleansedMarpole 0.002870 **
## neighbourhood_cleansedMount Pleasant 0.101728
## neighbourhood_cleansedOakridge 0.015190 *
## neighbourhood_cleansedRenfrew-Collingwood 0.005967 **
## neighbourhood_cleansedRiley Park 0.686610
## neighbourhood_cleansedShaughnessy 0.283382
## neighbourhood_cleansedSouth Cambie 0.847028
## neighbourhood_cleansedStrathcona 0.268948
## neighbourhood_cleansedSunset 0.009932 **
## neighbourhood_cleansedVictoria-Fraserview 0.0000051382 ***
## neighbourhood_cleansedWest End 0.000383 ***
## neighbourhood_cleansedWest Point Grey 0.198016
## host_listings_count 0.001610 **
## minimum_nights < 0.0000000000000002 ***
## maximum_nights 0.303335
## instant_bookablet 0.024313 *
## host_identity_verifiedt 0.626893
## availability_30 0.0000476631 ***
## availability_60 0.146962
## availability_90 0.558746
## review_scores_rating < 0.0000000000000002 ***
## reviews_per_month 0.0000006799 ***
## has_availabilityt 0.235992
## availability_365 0.002761 **
## number_of_reviews 0.015144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 4808.166)
##
## Null deviance: 37880796 on 3766 degrees of freedom
## Residual deviance: 17895994 on 3722 degrees of freedom
## AIC: 42674
##
## Number of Fisher Scoring iterations: 2
# Stepwise selection on log_model2 (a gaussian GLM, not a logistic model).
# step() is called without `scope` or `direction`, so it performs backward
# elimination: at each step the term whose removal gives the lowest AIC is
# dropped, stopping when <none> has the lowest AIC (see the trace below).
logstep_model2 <- step(log_model2)
## Start: AIC=42673.91
## price ~ host_is_superhost + host_response_time + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## maximum_nights + instant_bookable + host_identity_verified +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month + has_availability + availability_365 +
## number_of_reviews
##
## Df Deviance AIC
## - host_response_time 3 17904216 42670
## - host_identity_verified 1 17897130 42672
## - availability_90 1 17897638 42672
## - host_response_rate 1 17900252 42673
## - maximum_nights 1 17901089 42673
## - has_availability 1 17902749 42673
## <none> 17895994 42674
## - availability_60 1 17906112 42674
## - host_acceptance_rate 1 17913044 42675
## - instant_bookable 1 17920402 42677
## - number_of_reviews 1 17924387 42678
## - availability_365 1 17939129 42681
## - host_listings_count 1 17943898 42682
## - availability_30 1 17975703 42689
## - reviews_per_month 1 18015028 42697
## - host_is_superhost 1 18052632 42705
## - review_scores_rating 1 18238227 42743
## - accommodates 1 18345637 42765
## - bathrooms 1 18438130 42784
## - minimum_nights 1 18465041 42790
## - bedrooms 1 18741595 42846
## - neighbourhood_cleansed 22 21182809 43265
##
## Step: AIC=42669.64
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## host_identity_verified + availability_30 + availability_60 +
## availability_90 + review_scores_rating + reviews_per_month +
## has_availability + availability_365 + number_of_reviews
##
## Df Deviance AIC
## - host_identity_verified 1 17905433 42668
## - availability_90 1 17906086 42668
## - maximum_nights 1 17909374 42669
## - has_availability 1 17911190 42669
## <none> 17904216 42670
## - availability_60 1 17914440 42670
## - host_acceptance_rate 1 17925400 42672
## - instant_bookable 1 17926932 42672
## - number_of_reviews 1 17933049 42674
## - availability_365 1 17944967 42676
## - host_response_rate 1 17946279 42676
## - host_listings_count 1 17953052 42678
## - availability_30 1 17982220 42684
## - reviews_per_month 1 18022103 42692
## - host_is_superhost 1 18065616 42701
## - review_scores_rating 1 18252579 42740
## - accommodates 1 18356689 42762
## - bathrooms 1 18444964 42780
## - minimum_nights 1 18480067 42787
## - bedrooms 1 18749843 42841
## - neighbourhood_cleansed 22 21184640 43259
##
## Step: AIC=42667.89
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## availability_30 + availability_60 + availability_90 + review_scores_rating +
## reviews_per_month + has_availability + availability_365 +
## number_of_reviews
##
## Df Deviance AIC
## - availability_90 1 17907285 42666
## - maximum_nights 1 17910698 42667
## - has_availability 1 17912362 42667
## <none> 17905433 42668
## - availability_60 1 17915726 42668
## - host_acceptance_rate 1 17926842 42670
## - instant_bookable 1 17928032 42671
## - number_of_reviews 1 17934989 42672
## - availability_365 1 17946216 42674
## - host_response_rate 1 17947810 42675
## - host_listings_count 1 17953646 42676
## - availability_30 1 17983819 42682
## - reviews_per_month 1 18023110 42691
## - host_is_superhost 1 18065655 42699
## - review_scores_rating 1 18254467 42739
## - accommodates 1 18357774 42760
## - bathrooms 1 18446500 42778
## - minimum_nights 1 18480963 42785
## - bedrooms 1 18750862 42840
## - neighbourhood_cleansed 22 21185347 43258
##
## Step: AIC=42666.28
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + maximum_nights + instant_bookable +
## availability_30 + availability_60 + review_scores_rating +
## reviews_per_month + has_availability + availability_365 +
## number_of_reviews
##
## Df Deviance AIC
## - maximum_nights 1 17912531 42665
## - has_availability 1 17914286 42666
## <none> 17907285 42666
## - availability_60 1 17924060 42668
## - host_acceptance_rate 1 17929423 42669
## - instant_bookable 1 17930013 42669
## - number_of_reviews 1 17937286 42671
## - host_response_rate 1 17950193 42673
## - availability_365 1 17955830 42674
## - host_listings_count 1 17956824 42675
## - availability_30 1 17988953 42681
## - reviews_per_month 1 18024713 42689
## - host_is_superhost 1 18066061 42698
## - review_scores_rating 1 18256676 42737
## - accommodates 1 18359880 42758
## - bathrooms 1 18447599 42776
## - minimum_nights 1 18481995 42783
## - bedrooms 1 18751476 42838
## - neighbourhood_cleansed 22 21185408 43256
##
## Step: AIC=42665.38
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## availability_30 + availability_60 + review_scores_rating +
## reviews_per_month + has_availability + availability_365 +
## number_of_reviews
##
## Df Deviance AIC
## - has_availability 1 17919772 42665
## <none> 17912531 42665
## - availability_60 1 17929040 42667
## - host_acceptance_rate 1 17934381 42668
## - instant_bookable 1 17934786 42668
## - number_of_reviews 1 17945472 42670
## - host_response_rate 1 17955627 42672
## - availability_365 1 17957932 42673
## - host_listings_count 1 17961881 42674
## - availability_30 1 17995236 42681
## - reviews_per_month 1 18026571 42687
## - host_is_superhost 1 18069058 42696
## - review_scores_rating 1 18265553 42737
## - accommodates 1 18364841 42757
## - bathrooms 1 18454432 42776
## - minimum_nights 1 18500055 42785
## - bedrooms 1 18753719 42836
## - neighbourhood_cleansed 22 21186606 43254
##
## Step: AIC=42664.91
## price ~ host_is_superhost + host_response_rate + host_acceptance_rate +
## accommodates + bathrooms + bedrooms + neighbourhood_cleansed +
## host_listings_count + minimum_nights + instant_bookable +
## availability_30 + availability_60 + review_scores_rating +
## reviews_per_month + availability_365 + number_of_reviews
##
## Df Deviance AIC
## <none> 17919772 42665
## - availability_60 1 17936455 42666
## - host_acceptance_rate 1 17941926 42668
## - instant_bookable 1 17942920 42668
## - number_of_reviews 1 17952956 42670
## - host_response_rate 1 17962586 42672
## - availability_365 1 17965835 42673
## - host_listings_count 1 17969121 42673
## - availability_30 1 18002958 42680
## - reviews_per_month 1 18034916 42687
## - host_is_superhost 1 18079095 42696
## - review_scores_rating 1 18271090 42736
## - accommodates 1 18375007 42757
## - bathrooms 1 18465918 42776
## - minimum_nights 1 18508838 42785
## - bedrooms 1 18755936 42835
## - neighbourhood_cleansed 22 21192590 43253
# Summary of the stepwise-selected model for "Log Model 2".
# NOTE: despite the "log"/"logistic" naming, this is a gaussian-family GLM
# (no `family` argument in the call shown below), so the coefficients are on
# the price scale.
summary(logstep_model2)
##
## Call:
## glm(formula = price ~ host_is_superhost + host_response_rate +
## host_acceptance_rate + accommodates + bathrooms + bedrooms +
## neighbourhood_cleansed + host_listings_count + minimum_nights +
## instant_bookable + availability_30 + availability_60 + review_scores_rating +
## reviews_per_month + availability_365 + number_of_reviews,
## data = air_clean_filtered)
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -69.91018 20.69287 -3.378
## host_is_superhostt 14.33211 2.48909 5.758
## host_response_rate -36.43895 12.20789 -2.985
## host_acceptance_rate 16.86872 7.85642 2.147
## accommodates 10.78229 1.10781 9.733
## bathrooms 30.76770 2.88610 10.661
## bedrooms 36.87459 2.79545 13.191
## neighbourhood_cleansedDowntown 39.74247 10.20097 3.896
## neighbourhood_cleansedDowntown Eastside 25.41006 11.08268 2.293
## neighbourhood_cleansedDunbar Southlands -12.75132 12.04567 -1.059
## neighbourhood_cleansedFairview 23.47804 12.42448 1.890
## neighbourhood_cleansedGrandview-Woodland -1.79853 11.58297 -0.155
## neighbourhood_cleansedHastings-Sunrise -31.50081 11.47586 -2.745
## neighbourhood_cleansedKensington-Cedar Cottage -19.38871 10.97832 -1.766
## neighbourhood_cleansedKerrisdale -47.40085 14.15218 -3.349
## neighbourhood_cleansedKillarney -51.28220 14.05060 -3.650
## neighbourhood_cleansedKitsilano 41.30882 10.74097 3.846
## neighbourhood_cleansedMarpole -37.11291 12.29200 -3.019
## neighbourhood_cleansedMount Pleasant 18.08668 10.96627 1.649
## neighbourhood_cleansedOakridge -33.24005 13.50823 -2.461
## neighbourhood_cleansedRenfrew-Collingwood -30.79953 11.32928 -2.719
## neighbourhood_cleansedRiley Park -4.25545 11.06709 -0.385
## neighbourhood_cleansedShaughnessy -13.68428 13.45162 -1.017
## neighbourhood_cleansedSouth Cambie -2.41965 13.92502 -0.174
## neighbourhood_cleansedStrathcona 18.99607 17.33737 1.096
## neighbourhood_cleansedSunset -32.22891 12.64232 -2.549
## neighbourhood_cleansedVictoria-Fraserview -58.82026 13.02703 -4.515
## neighbourhood_cleansedWest End 38.35129 10.86815 3.529
## neighbourhood_cleansedWest Point Grey 18.39524 14.12524 1.302
## host_listings_count 0.11816 0.03687 3.205
## minimum_nights -0.65345 0.05902 -11.072
## instant_bookablet -6.12523 2.79082 -2.195
## availability_30 1.09260 0.26261 4.161
## availability_60 -0.25855 0.13876 -1.863
## review_scores_rating 24.48425 2.86356 8.550
## reviews_per_month -4.05327 0.82805 -4.895
## availability_365 0.03690 0.01192 3.096
## number_of_reviews -0.04641 0.01766 -2.628
## Pr(>|t|)
## (Intercept) 0.000736 ***
## host_is_superhostt 0.0000000092 ***
## host_response_rate 0.002855 **
## host_acceptance_rate 0.031848 *
## accommodates < 0.0000000000000002 ***
## bathrooms < 0.0000000000000002 ***
## bedrooms < 0.0000000000000002 ***
## neighbourhood_cleansedDowntown 0.0000995303 ***
## neighbourhood_cleansedDowntown Eastside 0.021916 *
## neighbourhood_cleansedDunbar Southlands 0.289859
## neighbourhood_cleansedFairview 0.058881 .
## neighbourhood_cleansedGrandview-Woodland 0.876614
## neighbourhood_cleansedHastings-Sunrise 0.006081 **
## neighbourhood_cleansedKensington-Cedar Cottage 0.077462 .
## neighbourhood_cleansedKerrisdale 0.000818 ***
## neighbourhood_cleansedKillarney 0.000266 ***
## neighbourhood_cleansedKitsilano 0.000122 ***
## neighbourhood_cleansedMarpole 0.002551 **
## neighbourhood_cleansedMount Pleasant 0.099170 .
## neighbourhood_cleansedOakridge 0.013911 *
## neighbourhood_cleansedRenfrew-Collingwood 0.006587 **
## neighbourhood_cleansedRiley Park 0.700619
## neighbourhood_cleansedShaughnessy 0.309079
## neighbourhood_cleansedSouth Cambie 0.862061
## neighbourhood_cleansedStrathcona 0.273293
## neighbourhood_cleansedSunset 0.010834 *
## neighbourhood_cleansedVictoria-Fraserview 0.0000065198 ***
## neighbourhood_cleansedWest End 0.000423 ***
## neighbourhood_cleansedWest Point Grey 0.192896
## host_listings_count 0.001364 **
## minimum_nights < 0.0000000000000002 ***
## instant_bookablet 0.028241 *
## availability_30 0.0000324573 ***
## availability_60 0.062509 .
## review_scores_rating < 0.0000000000000002 ***
## reviews_per_month 0.0000010248 ***
## availability_365 0.001976 **
## number_of_reviews 0.008629 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 4805.517)
##
## Null deviance: 37880796 on 3766 degrees of freedom
## Residual deviance: 17919772 on 3729 degrees of freedom
## AIC: 42665
##
## Number of Fisher Scoring iterations: 2
# Collect the two stepwise-selected models and print them side by side.
# type = "text" renders a plain-text table; single.row = TRUE places each
# standard error on the same row as its coefficient.
# NOTE(review): header and font.size look like LaTeX-oriented options and
# presumably have no effect on text output -- confirm against the stargazer
# documentation.
# NOTE: the models were fitted to different data (air_clean vs
# air_clean_filtered, see their Call output), so the reported log-likelihood
# and AIC values are not directly comparable between the two columns.
logmodels_list <- list(logstep_model1, logstep_model2)
stargazer(logmodels_list, type = "text", title = "Model Comparison",
column.labels = c("Log Model 1", "Log Model 2"),
header = FALSE, single.row = TRUE, font.size = "small")
##
## Model Comparison
## ======================================================================================
## Dependent variable:
## ---------------------------------------
## price
## Log Model 1 Log Model 2
## (1) (2)
## --------------------------------------------------------------------------------------
## host_is_superhostt 15.775** (6.493) 14.332*** (2.489)
## host_response_rate -36.439*** (12.208)
## host_acceptance_rate 16.869** (7.856)
## accommodates 14.090*** (2.935) 10.782*** (1.108)
## bathrooms 62.046*** (7.530) 30.768*** (2.886)
## bedrooms 36.924*** (7.390) 36.875*** (2.795)
## neighbourhood_cleansedDowntown 19.793 (26.879) 39.742*** (10.201)
## neighbourhood_cleansedDowntown Eastside 4.120 (29.326) 25.410** (11.083)
## neighbourhood_cleansedDunbar Southlands -33.021 (31.883) -12.751 (12.046)
## neighbourhood_cleansedFairview -4.795 (33.039) 23.478* (12.424)
## neighbourhood_cleansedGrandview-Woodland -29.936 (30.674) -1.799 (11.583)
## neighbourhood_cleansedHastings-Sunrise -57.059* (30.442) -31.501*** (11.476)
## neighbourhood_cleansedKensington-Cedar Cottage -38.796 (28.964) -19.389* (10.978)
## neighbourhood_cleansedKerrisdale -60.415 (37.403) -47.401*** (14.152)
## neighbourhood_cleansedKillarney -80.885** (37.482) -51.282*** (14.051)
## neighbourhood_cleansedKitsilano 23.905 (28.334) 41.309*** (10.741)
## neighbourhood_cleansedMarpole -65.393** (32.702) -37.113*** (12.292)
## neighbourhood_cleansedMount Pleasant 31.457 (29.004) 18.087* (10.966)
## neighbourhood_cleansedOakridge -5.176 (35.423) -33.240** (13.508)
## neighbourhood_cleansedRenfrew-Collingwood -52.451* (29.969) -30.800*** (11.329)
## neighbourhood_cleansedRiley Park -20.598 (29.219) -4.255 (11.067)
## neighbourhood_cleansedShaughnessy -20.110 (35.397) -13.684 (13.452)
## neighbourhood_cleansedSouth Cambie -24.635 (37.177) -2.420 (13.925)
## neighbourhood_cleansedStrathcona -8.093 (46.512) 18.996 (17.337)
## neighbourhood_cleansedSunset -56.751* (33.528) -32.229** (12.642)
## neighbourhood_cleansedVictoria-Fraserview -92.513*** (34.665) -58.820*** (13.027)
## neighbourhood_cleansedWest End 18.677 (28.660) 38.351*** (10.868)
## neighbourhood_cleansedWest Point Grey 70.989* (36.627) 18.395 (14.125)
## host_listings_count 0.118*** (0.037)
## minimum_nights 0.224 (0.151) -0.653*** (0.059)
## host_identity_verifiedt -64.425*** (17.196)
## instant_bookablet -6.125** (2.791)
## availability_30 1.244*** (0.320) 1.093*** (0.263)
## availability_60 -0.259* (0.139)
## review_scores_rating 33.797*** (7.694) 24.484*** (2.864)
## reviews_per_month -5.417*** (1.963) -4.053*** (0.828)
## availability_365 0.037*** (0.012)
## number_of_reviews -0.046*** (0.018)
## Constant -112.085** (49.041) -69.910*** (20.693)
## --------------------------------------------------------------------------------------
## Observations 3,818 3,767
## Log Likelihood -25,393.910 -21,294.450
## Akaike Inf. Crit. 50,851.830 42,664.910
## ======================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Exploratory plots for Log Model 2.
# The plots are drawn from the model frame (`log_model2$model`), i.e. exactly
# the rows and columns the model was fitted on. Passing that frame as `data`
# and using bare column names in aes()/facet_wrap() keeps every mapping tied to
# the same rows, instead of injecting `log_model2$model$...` vectors by hand.
log_model2_frame <- log_model2$model

# Price vs. accommodates, one panel per bedroom count
p <- ggplot(log_model2_frame, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bedrooms)
print(p + labs(title = "Price compared to accommodation size split by bedrooms (Log Model 2)", y = "Price", x = "Accommodates"))

# Price vs. accommodates, one panel per bathroom count, with a boxplot per
# accommodates value drawn on top of the points
p <- ggplot(log_model2_frame, aes(x = accommodates, y = price)) +
  geom_point() +
  facet_wrap(~bathrooms) +
  geom_boxplot(aes(group = accommodates))
print(p + labs(title = "Price compared to accommodation size split by bathrooms (Log Model 2)", y = "Price", x = "Accommodates"))

# Price distribution per bathroom count with a smoothed trend on top
p <- ggplot(log_model2_frame, aes(x = bathrooms, y = price)) +
  geom_boxplot(aes(group = bathrooms)) +
  geom_smooth()
print(p + labs(title = "Price compared to bathrooms (Log Model 2)", y = "Price", x = "Bathrooms"))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Price vs. total number of reviews
p <- ggplot(log_model2_frame, aes(x = number_of_reviews, y = price)) +
  geom_point()
print(p + labs(title = "Price compared to the number of reviews (Log Model 2)", y = "Price", x = "Number of reviews"))
Comparison of Models
# stargazer renders side-by-side regression tables
library(stargazer)

# The three stepwise OLS fits to compare, in column order
models_list <- list(step_model, step_model2, step_model3)

# Plain-text comparison table: one row per coefficient, with the standard
# error printed on the same row as its estimate
stargazer(
  models_list,
  type = "text",
  title = "Model Comparison",
  column.labels = c("Model 1", "Model 2", "Model 3"),
  header = FALSE,
  single.row = TRUE,
  font.size = "small"
)
##
## Model Comparison
## =============================================================================================================================
## Dependent variable:
## ------------------------------------------------------------------------------
## price
## Model 1 Model 2 Model 3
## (1) (2) (3)
## -----------------------------------------------------------------------------------------------------------------------------
## host_is_superhostt 15.775** (6.493) 15.802* (8.254) 13.853*** (2.483)
## host_response_rate -36.381*** (12.233)
## host_acceptance_rate 16.201** (7.881)
## accommodates 14.090*** (2.935) 13.486*** (3.702) 10.949*** (1.108)
## bathrooms 62.046*** (7.530) 67.610*** (10.031) 31.087*** (2.890)
## bedrooms 36.924*** (7.390) 30.971*** (9.488) 36.986*** (2.799)
## neighbourhood_cleansedDowntown 19.793 (26.879) 40.471*** (10.215)
## neighbourhood_cleansedDowntown Eastside 4.120 (29.326) -17.499 (15.841) 26.150** (11.101)
## neighbourhood_cleansedDunbar Southlands -33.021 (31.883) -11.687 (12.065)
## neighbourhood_cleansedFairview -4.795 (33.039) 23.800* (12.447)
## neighbourhood_cleansedGrandview-Woodland -29.936 (30.674) -2.725 (11.599)
## neighbourhood_cleansedHastings-Sunrise -57.059* (30.442) -73.211*** (18.515) -31.742*** (11.497)
## neighbourhood_cleansedKensington-Cedar Cottage -38.796 (28.964) -18.534* (10.996)
## neighbourhood_cleansedKerrisdale -60.415 (37.403) -74.843** (30.507) -48.284*** (14.172)
## neighbourhood_cleansedKillarney -80.885** (37.482) -92.231*** (30.614) -49.232*** (14.067)
## neighbourhood_cleansedKitsilano 23.905 (28.334) 9.953 (13.803) 42.071*** (10.760)
## neighbourhood_cleansedMarpole -65.393** (32.702) -81.246*** (22.617) -36.482*** (12.312)
## neighbourhood_cleansedMount Pleasant 31.457 (29.004) 15.946 (15.170) 18.553* (10.983)
## neighbourhood_cleansedOakridge -5.176 (35.423) -14.613 (27.603) -33.292** (13.529)
## neighbourhood_cleansedRenfrew-Collingwood -52.451* (29.969) -66.994*** (17.539) -31.234*** (11.348)
## neighbourhood_cleansedRiley Park -20.598 (29.219) -3.551 (11.085)
## neighbourhood_cleansedShaughnessy -20.110 (35.397) -13.967 (13.471)
## neighbourhood_cleansedSouth Cambie -24.635 (37.177) -2.499 (13.949)
## neighbourhood_cleansedStrathcona -8.093 (46.512) 19.798 (17.362)
## neighbourhood_cleansedSunset -56.751* (33.528) -70.423*** (24.190) -32.036** (12.662)
## neighbourhood_cleansedVictoria-Fraserview -92.513*** (34.665) -107.547*** (26.302) -58.745*** (13.042)
## neighbourhood_cleansedWest End 18.677 (28.660) -0.273 (14.134) 38.205*** (10.883)
## neighbourhood_cleansedWest Point Grey 70.989* (36.627) 18.030 (14.151)
## host_listings_count 0.117*** (0.037)
## minimum_nights 0.224 (0.151) 0.587*** (0.170) -0.632*** (0.059)
## instant_bookablet -13.185 (9.303) -4.976* (2.778)
## host_identity_verifiedt -64.425*** (17.196) -64.441*** (23.422)
## availability_30 1.244*** (0.320) 1.474*** (0.419) 1.155*** (0.285)
## availability_60 -0.472* (0.280)
## availability_90 0.210 (0.140)
## review_scores_rating 33.797*** (7.694) 33.159*** (9.411) 24.363*** (2.863)
## reviews_per_month -5.417*** (1.963) -5.131*** (0.752)
## Constant -112.085** (49.041) -102.057** (51.708) -69.213*** (20.637)
## -----------------------------------------------------------------------------------------------------------------------------
## Observations 3,818 2,803 3,767
## R2 0.222 0.172 0.525
## Adjusted R2 0.216 0.166 0.521
## Residual Std. Error 187.938 (df = 3786) 207.711 (df = 2781) 69.448 (df = 3730)
## F Statistic 34.883*** (df = 31; 3786) 27.466*** (df = 21; 2781) 114.561*** (df = 36; 3730)
## =============================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Print numbers in fixed notation rather than scientific notation
options(scipen = 999)

# Build a residuals-vs-fitted scatter plot for a fitted model.
#
# The plotting data is assembled from the model itself, so the fitted values
# and residuals always have matching lengths. Mapping them against a separate
# data frame (as was done before) fails as soon as the model has dropped rows
# that the data frame still contains.
#
# model: a fitted model supporting fitted() and residuals()
# title: plot title
# Returns the ggplot object (not printed).
plot_residuals_vs_fitted <- function(model, title) {
  diagnostics <- data.frame(
    fitted_value = stats::fitted(model),
    residual = stats::residuals(model)
  )
  ggplot(diagnostics, aes(x = fitted_value, y = residual)) +
    geom_point() +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    xlab("Fitted Values") +
    ylab("Residuals") +
    ggtitle(title)
}

# Model 3: residuals vs fitted, then the residual distribution
residuals_vs_fitted <- plot_residuals_vs_fitted(step_model3, "Residuals vs Fitted Model 3")
print(residuals_vs_fitted)
# The variable name `residuals` is kept for compatibility; stats:: makes it
# explicit that the function, not this vector, is being called.
residuals <- stats::residuals(step_model3)
hist(residuals, breaks = 30, main = "Histogram of Residuals OLS Model 3", xlab = "Residuals")

# Model 1: residuals vs fitted, then the residual distribution
residuals_vs_fitted2 <- plot_residuals_vs_fitted(step_model, "Residuals vs Fitted Model 1")
print(residuals_vs_fitted2)
residuals2 <- stats::residuals(step_model)
hist(residuals2, breaks = 30, main = "Histogram of Residuals OLS Model 1", xlab = "Residuals")
library(rpart)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
# Median listing price: the cut point for the binary classification target
median_price <- median(air_clean_filtered$price)

# Binary target: is the listing priced above the median?
# The levels are given explicitly so that "Above Median" is always the first
# level (the class caret::confusionMatrix reports as 'Positive'), rather than
# relying on alphabetical ordering of the labels.
air_clean_filtered$price_binary <- factor(
  ifelse(air_clean_filtered$price > median_price, "Above Median", "Below Median"),
  levels = c("Above Median", "Below Median")
)

# Convert the remaining character columns to factors.
# across(where(...)) replaces the superseded mutate_if().
air_clean_filtered <- air_clean_filtered %>%
  mutate(across(where(is.character), as.factor))

# Set the seed for reproducibility
set.seed(123)

# Shuffle the row order
air_clean_filtered <- air_clean_filtered[sample(nrow(air_clean_filtered)), ]

# 70% / 30% train / test split; createDataPartition samples on `price`
train_index <- createDataPartition(air_clean_filtered$price, p = 0.7, list = FALSE)
train_data <- air_clean_filtered[train_index, ]
test_data <- air_clean_filtered[-train_index, ]
# Predictors offered to the classification tree, in the order they enter the
# model formula
tree_predictors <- c(
  "host_is_superhost", "host_response_time",
  "host_response_rate", "host_acceptance_rate",
  "accommodates", "bathrooms", "bedrooms",
  "neighbourhood_cleansed", "host_listings_count",
  "minimum_nights", "maximum_nights", "instant_bookable",
  "host_identity_verified", "availability_30",
  "availability_60", "availability_90",
  "review_scores_rating", "reviews_per_month"
)

# price_binary ~ <all predictors above>
tree_formula <- reformulate(tree_predictors, response = "price_binary")

# Fit the tree on the training rows only
tree_model <- rpart(formula = tree_formula, data = train_data)

# Draw the fitted tree
rpart.plot(tree_model)
# Relative importance of each predictor in the fitted tree.
# (The character-to-factor conversion that used to be repeated here was a
# no-op: the same conversion already runs before the train/test split.)

# Raw importance scores stored on the rpart fit
variable_importance <- tree_model$variable.importance
# Sum of all scores, kept as its own object for later reference
total_importance <- sum(variable_importance)
# Each variable's share of the total, in percent
percentage_importance <- prop.table(variable_importance) * 100
# Largest contributors first
sorted_percentage <- sort(percentage_importance, decreasing = TRUE)
print(sorted_percentage)
## bedrooms accommodates bathrooms
## 25.35976185 19.72044362 17.29235380
## neighbourhood_cleansed minimum_nights reviews_per_month
## 15.47136343 6.80647157 4.97305337
## host_acceptance_rate availability_30 host_listings_count
## 2.24625502 2.01425397 1.87239199
## host_response_time availability_60 availability_90
## 1.84168917 0.99998840 0.64275407
## maximum_nights review_scores_rating host_response_rate
## 0.51170019 0.20504798 0.04247158
# Keep only the selected input variables plus the binary price target.
# all_of() is required when selecting with an external character vector: it
# avoids the tidyselect deprecation warning and fails loudly if a requested
# column is missing.
data_subset <- air_clean_filtered %>%
  select(all_of(c(selected_variables, "price_binary")))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(selected_variables)
##
## # Now:
## data %>% select(all_of(selected_variables))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Set up the training control for 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)

# Grid of neighbourhood sizes (k) to evaluate
k_values <- data.frame(k = c(1, 3, 5, 7, 9, 11, 13)) # Adjust as needed

# Cross-validated accuracy for each k on the selected-variable subset,
# used only for the accuracy-vs-k plot below
knn_model <- train(price_binary ~ ., data = data_subset, method = "knn", trControl = train_control, tuneGrid = k_values)

# Plot model accuracy vs. Number of Neighbors (K)
plot(knn_model, main = "Model Accuracy vs. Number of Neighbors (K)", xlab = "Number of Neighbors (K)", ylab = "Accuracy")

# Final model: fit on the training rows, evaluate on the held-out test rows.
# The training frame is restricted to the columns of data_subset and `price`
# is excluded explicitly: price_binary is derived from price, so leaving price
# among the predictors (as `price_binary ~ .` on the full train_data did) leaks
# the target into the model and inflates every metric.
# NOTE(review): predictors are not centred/scaled here; KNN is distance-based,
# so consider preProcess = c("center", "scale") - confirm with the analysis owner.
knn_columns <- setdiff(names(data_subset), "price")
knn_model <- train(
  price_binary ~ .,
  data = train_data[, knn_columns, drop = FALSE],
  method = "knn",
  trControl = train_control,
  tuneGrid = k_values # same candidate k values as the tuning run above
)
knn_predictions <- predict(knn_model, test_data)

# Compute the confusion matrix on the test set
knn_metrics <- confusionMatrix(knn_predictions, test_data$price_binary)
print(knn_metrics)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Above Median Below Median
## Above Median 524 33
## Below Median 36 535
##
## Accuracy : 0.9388
## 95% CI : (0.9232, 0.9521)
## No Information Rate : 0.5035
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.8776
##
## Mcnemar's Test P-Value : 0.8097
##
## Sensitivity : 0.9357
## Specificity : 0.9419
## Pos Pred Value : 0.9408
## Neg Pred Value : 0.9370
## Prevalence : 0.4965
## Detection Rate : 0.4645
## Detection Prevalence : 0.4938
## Balanced Accuracy : 0.9388
##
## 'Positive' Class : Above Median
##
# Per-class statistics from the KNN confusion matrix: pull out precision and
# recall so they can be reported next to the overall accuracy
knn_by_class <- knn_metrics$byClass
knn_precision <- knn_by_class["Precision"]
knn_recall <- knn_by_class["Recall"]

# Report the three headline metrics
print("KNN Metrics:")
## [1] "KNN Metrics:"
print(paste0("Accuracy: ", knn_metrics$overall["Accuracy"]))
## [1] "Accuracy: 0.938829787234043"
print(paste0("Precision: ", knn_precision))
## [1] "Precision: 0.940754039497307"
print(paste0("Recall: ", knn_recall))
## [1] "Recall: 0.935714285714286"
# A single hypothetical listing used to compare the price predictions of the
# fitted models. Column names match the predictors used by those models.
new_listing <- data.frame(
  # --- Host profile ---
  host_is_superhost      = "f",            # not a superhost
  host_response_time     = "within a day", # typical response time
  host_response_rate     = 0.95,           # 95% response rate
  host_acceptance_rate   = 0.97,           # 97% acceptance rate
  # --- Size of the property ---
  accommodates           = 4,              # guests accommodated
  bathrooms              = 1,              # bathrooms
  bedrooms               = 2,              # bedrooms
  # --- Location and booking rules ---
  neighbourhood_cleansed = "Kitsilano",    # neighbourhood name
  host_listings_count    = 3,              # listings managed by the host
  minimum_nights         = 2,              # shortest bookable stay
  maximum_nights         = 30,             # longest bookable stay
  instant_bookable       = "f",            # instant booking disabled
  host_identity_verified = "t",            # host identity is verified
  # --- Availability over the coming days ---
  availability_30        = 25,             # next 30 days
  availability_60        = 50,             # next 60 days
  availability_90        = 70,             # next 90 days
  # --- Reviews ---
  review_scores_rating   = 4.5,            # overall review score
  reviews_per_month      = 2,              # reviews received per month
  # --- Remaining fields ---
  has_availability       = "t",            # listing has availability
  availability_365       = 300,            # next 365 days
  number_of_reviews      = 50              # total number of reviews
)
# Predict the price of `new_listing` with each fitted model and print the
# result under a label. Each predict() call uses its default arguments.
# NOTE(review): the labels jump from "Model 3" to "Model 5"/"Model 6" for the
# two logstep models - confirm the intended numbering.
# NOTE(review): predict() is called without `type`; if the logstep models are
# glm fits with a non-identity link, the printed value is on the link scale -
# confirm.

# Stepwise OLS model 1
print("Predicted Price from Model 1:")
## [1] "Predicted Price from Model 1:"
print(predict(step_model, newdata = new_listing))
## 1
## 212.4489
# Stepwise OLS model 2
print("Predicted Price from Model 2:")
## [1] "Predicted Price from Model 2:"
print(predict(step_model2, newdata = new_listing))
## 1
## 214.1977
# Stepwise OLS model 3
print("Predicted Price from Model 3:")
## [1] "Predicted Price from Model 3:"
print(predict(step_model3, newdata = new_listing))
## 1
## 221.2854
# logstep_model1 (printed as "Model 5").
# NOTE(review): the value printed below is identical to Model 1's prediction
# above - verify that logstep_model1 is really a different model.
print("Predicted Price from Model 5:")
## [1] "Predicted Price from Model 5:"
print(predict(logstep_model1, newdata = new_listing))
## 1
## 212.4489
# logstep_model2 (printed as "Model 6")
print("Predicted Price from Model 6:")
## [1] "Predicted Price from Model 6:"
print(predict(logstep_model2, newdata = new_listing))
## 1
## 225.0468
Distribution of price in different datasets
# Load the ggplot2 library
library(ggplot2)

# Histogram of the `price` column of a dataset.
#
# The same chart is drawn for each modelling dataset, so the shared styling
# lives in one place instead of three copy-pasted blocks.
#
# data:  data frame with a numeric `price` column
# title: plot title
# Returns the ggplot object; it is drawn when the call is evaluated at top
# level, exactly like a bare ggplot() expression.
plot_price_histogram <- function(data, title) {
  ggplot(data, aes(x = price)) +
    geom_histogram(binwidth = 25, color = "black", fill = "skyblue", alpha = 0.8) +
    labs(title = title, x = "Price", y = "Frequency") +
    theme_minimal()
}

# Price distribution in the dataset behind Model 1
plot_price_histogram(air_clean, "Distribution of Price Model 1")

# Price distribution in the dataset behind Model 2
plot_price_histogram(air_clean_sig, "Distribution of Price Model 2")

# Price distribution in the dataset behind Model 3
plot_price_histogram(air_clean_filtered, "Distribution of Price Model 3")